def __init__(self, trip_date, dpath):
    # Collect calendar.txt from every GTFS feed in effect on trip_date.
    frames = []
    for fi in effective_files(trip_date, dpath=dpath):
        try:
            with ZipFile(fi) as zf:
                raw_text = zf.read('calendar.txt')
            # zf.read() returns bytes, so wrap in BytesIO (not StringIO)
            frames.append(pd.read_csv(BytesIO(raw_text)))
        except Exception:
            print('Error reading calendar from ' + fi)
    service_dates = pd.concat(frames) if frames else pd.DataFrame()
    service_dates.set_index('service_id', drop=True, inplace=True,
                            verify_integrity=True)
    self.service_dates = service_dates

    # Collect calendar_dates.txt (service exceptions) the same way.
    frames = []
    for fi in effective_files(trip_date, dpath=dpath):
        try:
            with ZipFile(fi) as zf:
                raw_text = zf.read('calendar_dates.txt')
            frames.append(pd.read_csv(BytesIO(raw_text),
                                      dtype={0: str, 1: str, 2: int}))
        except Exception:
            print('Error reading calendar_dates from ' + fi)
    service_exc = pd.concat(frames) if frames else pd.DataFrame(
        columns=['service_id', 'date', 'exception_type'])
    # service_exc.set_index('date', drop=True, inplace=True)
    self.service_exc = service_exc

def get_data_famafrench(name):
    # path of zip files
    zip_file_path = '{0}/{1}.zip'.format(_FAMAFRENCH_URL, name)

    with urlopen(zip_file_path) as url:
        raw = url.read()

    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw)
        with ZipFile(tmpf, 'r') as zf:
            data = zf.open(zf.namelist()[0]).readlines()

    line_lengths = np.array(lmap(len, data))
    file_edges = np.where(line_lengths == 2)[0]

    datasets = {}
    edges = zip(file_edges + 1, file_edges[1:])
    for i, (left_edge, right_edge) in enumerate(edges):
        dataset = [d.split() for d in data[left_edge:right_edge]]
        if len(dataset) > 10:
            ncol_raw = np.array(lmap(len, dataset))
            ncol = np.median(ncol_raw)
            header_index = np.where(ncol_raw == ncol - 1)[0][-1]
            header = dataset[header_index]
            ds_header = dataset[header_index + 1:]
            # to ensure the header is unique
            header = ['{0} {1}'.format(j, hj)
                      for j, hj in enumerate(header, start=1)]
            index = np.array([d[0] for d in ds_header], dtype=int)
            dataset = np.array([d[1:] for d in ds_header], dtype=float)
            datasets[i] = DataFrame(dataset, index, columns=header)

    return datasets

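# Usage sketch (hedged) for get_data_famafrench above: assumes _FAMAFRENCH_URL
# and the imports used by the function are in scope; the dataset name
# 'F-F_Research_Data_Factors' is illustrative, not taken from this module.
def demo_get_data_famafrench():
    datasets = get_data_famafrench('F-F_Research_Data_Factors')
    for key, frame in datasets.items():
        print(key, frame.shape)  # one sub-table per block found in the file
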
def get_bookcrossing_data(local_file=None, get_ratings=True, get_users=False,
                          get_books=False):
    if not local_file:
        # downloading data
        from requests import get
        zip_file_url = 'http://www2.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip'
        zip_response = get(zip_file_url)
        zip_contents = BytesIO(zip_response.content)
    else:
        zip_contents = local_file

    ratings = users = books = None
    with ZipFile(zip_contents) as zfile:
        zip_files = pd.Series(zfile.namelist())
        # flags=2 is re.IGNORECASE, i.e. a case-insensitive match
        zip_file = zip_files[zip_files.str.contains('ratings', flags=2)].iat[0]
        delimiter = ';'
        if get_ratings:
            zdata = zfile.read(zip_file)
            ratings = pd.read_csv(BytesIO(zdata), sep=delimiter, header=0,
                                  engine='c', encoding='unicode_escape')
        if get_users:
            zip_file = zip_files[zip_files.str.contains('users', flags=2)].iat[0]
            with zfile.open(zip_file) as zdata:
                users = pd.read_csv(zdata, sep=delimiter, header=0, engine='c',
                                    encoding='unicode_escape')
        if get_books:
            zip_file = zip_files[zip_files.str.contains('books', flags=2)].iat[0]
            with zfile.open(zip_file) as zdata:
                books = pd.read_csv(zdata, sep=delimiter, header=0, engine='c',
                                    quoting=1, escapechar='\\',
                                    encoding='unicode_escape',
                                    usecols=['ISBN', 'Book-Author', 'Publisher'])

    res = [data.rename(columns=lambda x: x.lower()
                       .replace('book-', '').replace('-id', 'id'), copy=False)
           for data in [ratings, users, books] if data is not None]
    if len(res) == 1:
        res = res[0]
    return res

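# Usage sketch (hedged): 'BX-CSV-Dump.zip' is a hypothetical local path; omit
# local_file to download the dump instead. With two tables requested the
# function returns a two-element list.
def demo_get_bookcrossing_data():
    ratings, books = get_bookcrossing_data('BX-CSV-Dump.zip',
                                           get_ratings=True, get_books=True)
    print(ratings.head())
    print(books.head())
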
def get_movielens_data(local_file=None, get_genres=False):
    '''Downloads movielens data and stores it in pandas dataframe.
    '''
    if not local_file:
        # downloading data
        from requests import get
        zip_file_url = 'http://files.grouplens.org/datasets/movielens/ml-10m.zip'
        zip_response = get(zip_file_url)
        zip_contents = BytesIO(zip_response.content)
    else:
        zip_contents = local_file

    # loading data into memory
    with ZipFile(zip_contents) as zfile:
        zip_files = pd.Series(zfile.namelist())
        zip_file = zip_files[zip_files.str.endswith('ratings.dat')].iat[0]
        zdata = zfile.read(zip_file)
        delimiter = ';'
        # replacing '::' makes the data compatible with the pandas c-engine
        zdata = zdata.replace(b'::', delimiter.encode())
        ml_data = pd.read_csv(BytesIO(zdata), sep=delimiter, header=None,
                              engine='c',
                              names=['userid', 'movieid', 'rating', 'timestamp'],
                              usecols=['userid', 'movieid', 'rating'])

        if get_genres:
            zip_file = zip_files[zip_files.str.endswith('movies.dat')].iat[0]
            with zfile.open(zip_file) as zdata:
                delimiter = '::'
                genres_data = pd.read_csv(zdata, sep=delimiter, header=None,
                                          engine='python',
                                          names=['movieid', 'movienm', 'genres'])
            ml_genres = split_genres(genres_data)
            ml_data = (ml_data, ml_genres)

    return ml_data

def _read_zipfile(self, url):
    zipf = BytesIO(self._get_response(url).content)
    with ZipFile(zipf, 'r') as zf:
        data = zf.open(zf.namelist()[0]).read().decode()
    return data

def _read_zipfile(self, url):
    raw = self._get_response(url).content
    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw)
        with ZipFile(tmpf, 'r') as zf:
            data = zf.open(zf.namelist()[0]).read().decode()
    return data

def _download_data_famafrench(name):
    url = ''.join([_URL, _URL_PREFIX, name, _URL_SUFFIX])
    with urlopen(url) as socket:
        raw = socket.read()
    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw)
        with ZipFile(tmpf, 'r') as zf:
            data = zf.open(zf.namelist()[0]).read().decode()
    return data

def _read_zipfile(self, ftppath):
    zipf = BytesIO()
    try:
        self._sec_ftp_session.retrbinary('RETR ' + ftppath, zipf.write)
    except EOFError:
        raise RemoteDataError('FTP server has closed the connection.')
    zipf.seek(0)
    with ZipFile(zipf, 'r') as zf:
        data = zf.open(zf.namelist()[0]).read().decode()
    return StringIO(data)

def _read_one_month(self, symbol, year, month):
    symbol = symbol.replace("/", "").upper()
    filename_cache = os.path.join(self.cache_directory,
                                  self._filename(symbol, year, month, '.zip'))
    zip_data, from_file_cache = self._get(symbol, year, month,
                                          filename_cache, 'bytes')
    with ZipFile(zip_data, 'r') as zf:
        zfile = zf.open(self._filename(symbol, year, month, '.csv'))
        df = pd.read_csv(zfile, names=['Symbol', 'Date', 'Bid', 'Ask'])
    df['Date'] = pd.to_datetime(df['Date'], format='%Y%m%d %H:%M:%S.%f')
    df = df.set_index('Date')
    return df

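# Usage sketch (hedged): assumes a reader object that provides the
# cache_directory attribute and the _get()/_filename() helpers this method
# relies on; the symbol and date are illustrative.
def demo_read_one_month(reader):
    df = reader._read_one_month('AUD/USD', 2014, 1)
    print(df.head())  # tick rows indexed by timestamp, with Bid/Ask columns
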
def load_trips(trip_date, dpath):
    # dpath argument is the path to the directory containing subdirectories
    frames = []
    for fi in effective_files(trip_date, dpath=dpath):
        try:
            with ZipFile(fi) as zf:
                raw_text = zf.read('trips.txt')
            frames.append(pd.read_csv(BytesIO(raw_text)))
        except Exception:
            print('Error reading from ' + fi)
    trips = pd.concat(frames)
    trips.set_index('trip_id', drop=True, inplace=True, verify_integrity=True)
    return trips

def main():
    expire_after = timedelta(days=1)
    if PY2:
        filename = 'cache_py2'
    else:
        filename = 'cache'
    session = requests_cache.CachedSession(cache_name=filename,
                                           expire_after=expire_after)

    dt = pd.to_datetime("2014-01-01")
    symbol = "AUD/USD"
    symbol = symbol.replace("/", "").upper()
    year = dt.year
    month = dt.month
    month_name = datetime.datetime(year=1970, month=month,
                                   day=1).strftime('%B').upper()
    # url = "http://www.truefx.com/dev/data/2014/JANUARY-2014/AUDUSD-2014-01.zip"
    url = ("http://www.truefx.com/dev/data/{year:04d}/"
           "{month_name}-{year:04d}/"
           "{symbol}-{year:04d}-{month:02d}.zip").format(
               year=year, month=month, symbol=symbol, month_name=month_name)
    response = session.get(url)
    zip_data = BytesIO(response.content)
    filename = "{symbol}-{year:04d}-{month:02d}.csv".format(
        year=year, month=month, symbol=symbol)

    with ZipFile(zip_data, 'r') as zf:
        # filename = zf.namelist()[0]
        zfile = zf.open(filename)
        data = zfile.readlines()
        # df = pd.read_csv(zfile._fileobj)  # ToFix: can't make it work correctly

    df = pd.DataFrame(data)
    # df = df[:100]  # just for test
    df[0] = df[0].str.decode('utf8')
    df[0] = df[0].str.replace('\n', '')
    df[0] = df[0].map(lambda s: s.split(','))
    df['Symbol'] = df[0].map(lambda t: t[0])
    df['Date'] = df[0].map(lambda t: pd.to_datetime(t[1]))
    df['Bid'] = df[0].map(lambda t: t[2]).astype(float)
    df['Ask'] = df[0].map(lambda t: t[3]).astype(float)
    del df[0]
    df = df.set_index('Date')
    print(df)

def get_bx_data(file_path, get_ratings=True, get_users=False, get_books=False):
    ratings = users = books = None
    with ZipFile(file_path) as zfile:
        zip_files = pd.Series(zfile.namelist())
        # flags=2 is re.IGNORECASE
        zip_file = zip_files[zip_files.str.contains('ratings', flags=2)].iat[0]
        delimiter = ';'
        if get_ratings:
            zdata = zfile.read(zip_file)
            # zfile.read() returns bytes, so wrap in BytesIO (not StringIO)
            ratings = pd.read_csv(BytesIO(zdata), sep=delimiter, header=0,
                                  engine='c')
        if get_users:
            zip_file = zip_files[zip_files.str.contains('users', flags=2)].iat[0]
            with zfile.open(zip_file) as zdata:
                users = pd.read_csv(zdata, sep=delimiter, header=0, engine='c')
        if get_books:
            zip_file = zip_files[zip_files.str.contains('books', flags=2)].iat[0]
            with zfile.open(zip_file) as zdata:
                books = pd.read_csv(zdata, sep=delimiter, header=0, engine='c',
                                    quoting=1, escapechar='\\',
                                    usecols=['ISBN', 'Book-Author', 'Publisher'])

    res = [data.rename(columns=lambda x: x.lower()
                       .replace('book-', '').replace('-id', 'id'))
           for data in [ratings, users, books] if data is not None]
    return res

def get_list(pathName=default_zip_file, input_filters_dict={}):
    ''' Read the entire database of fonts to find out what unique entries are
    available.

    Parameters
    ---------------
    pathName : the path of the zip file containing the database of characters
    input_filters_dict : a dictionary containing columns in the .csv file to be
        extracted. keys = column heading, values = value to be allowed in that
        column. Returns an entire column if a key is not provided for it.

    Returns
    --------------
    a dataframe of all the unique lines in the dataset.

    Example:
    --------------
    print(ocr_utils.get_list(input_filters_dict={'font': ()}))
    '''
    # speed up list if only the font list is needed
    try:
        if (len(input_filters_dict) == 1) and (len(input_filters_dict['font']) == 0):
            with ZipFile(pathName, 'r') as myzip:
                y = sorted(myzip.namelist())
            for i, l in enumerate(y):
                y[i] = [l.replace('.csv', '')]
            return y
    except (KeyError, TypeError):
        pass

    df = read_file(pathName, input_filters_dict)
    df = df.loc[:, :'r0c0']
    keys = list(input_filters_dict.keys())
    df = df[keys]
    df = df.drop_duplicates()
    return df

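# Usage sketch (hedged): first list every font in the zip via the fast path,
# then narrow to one font; 'OCRA' is an illustrative font name.
def demo_get_list():
    all_fonts = get_list(input_filters_dict={'font': ()})
    print(all_fonts[:5])
    one_font = get_list(input_filters_dict={'font': 'OCRA'})
    print(one_font)
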
def load_stops(trip_date, dpath, clean=True):
    # dpath argument is the path to the directory containing zips for each
    # agency or borough
    # clean=True eliminates duplicates that occur due to stops being shared by
    # multiple lines across GTFS files. Method will return an error if this is
    # set to False but duplicates exist
    frames = []
    for fi in effective_files(trip_date, dpath=dpath):
        try:
            with ZipFile(fi) as zf:
                raw_text = zf.read('stops.txt')
            frames.append(pd.read_csv(BytesIO(raw_text)))
        except Exception:
            print('Error reading from ' + fi)
    stops = pd.concat(frames)
    if clean:
        stops.drop_duplicates(subset='stop_id', inplace=True)
    stops.set_index('stop_id', drop=True, inplace=True, verify_integrity=True)
    return stops

def load_stop_times(trip_date, dpath):
    # dpath argument is the path to the directory containing zips for each
    # agency or borough
    # NOTE: returns times as string dtype (HH:MM:SS)
    time_frames = []
    agency_frames = []
    for fi in effective_files(trip_date, dpath=dpath):
        try:
            with ZipFile(fi) as zf:
                raw_text = zf.read('stop_times.txt')
                agency = zf.read('agency.txt')
            time_frames.append(pd.read_csv(BytesIO(raw_text)))
            agency_frames.append(pd.read_csv(BytesIO(agency)))
        except Exception:
            print('Error reading from ' + fi)
    stop_times = pd.concat(time_frames)
    agency_df = pd.concat(agency_frames)
    stop_times.set_index(['trip_id', 'stop_id'], drop=True, inplace=True)
    tz_string = agency_df['agency_timezone'].unique()
    if len(tz_string) == 1:
        tz_string = tz_string[0]
    else:
        tz_string = 'error: zero or multiple zones'
    return stop_times, tz_string

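# Usage sketch (hedged): ties the three GTFS loaders above together; the trip
# date and directory path are illustrative placeholders.
def demo_load_gtfs():
    trip_date, dpath = '2016-01-15', 'gtfs_feeds/'
    trips = load_trips(trip_date, dpath)
    stops = load_stops(trip_date, dpath, clean=True)
    stop_times, tz = load_stop_times(trip_date, dpath)
    print(len(trips), len(stops), len(stop_times), tz)
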
def build_metadata(dpath):
    # Write a metadata.txt in each directory under dpath summarising the
    # effective/discontinued date range of every GTFS zip it contains.
    for root, dirs, files in os.walk(dpath):
        rows = []
        for fname in files:
            try:
                with ZipFile(os.path.join(root, fname)) as zf:
                    raw_text = zf.read('calendar.txt')
                    fdatetime = zf.getinfo('calendar.txt').date_time
                fdate = (str(fdatetime[0]) + '-' + str(fdatetime[1]).zfill(2) +
                         '-' + str(fdatetime[2]).zfill(2))
                csvdata = pd.read_csv(BytesIO(raw_text))
                min_e = str(min(csvdata['start_date']))
                min_eff = min_e[:4] + '-' + min_e[4:6] + '-' + min_e[6:8]
                max_d = str(max(csvdata['end_date']))
                max_disc = max_d[:4] + '-' + max_d[4:6] + '-' + max_d[6:8]
                rows.append({'file': fname, 'min_eff': min_eff,
                             'max_disc': max_disc, 'file_date': fdate})
            except Exception:
                print('Error reading from ' + fname)
        mdata = pd.DataFrame(rows, columns=['file', 'min_eff', 'max_disc',
                                            'file_date'])
        mdata.to_csv(os.path.join(root, 'metadata.txt'), index=False)

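# Usage sketch (hedged): writes a metadata.txt alongside the GTFS zips in each
# directory under the illustrative path 'gtfs_feeds/', then inspects one result.
def demo_build_metadata():
    build_metadata('gtfs_feeds/')
    print(pd.read_csv('gtfs_feeds/metadata.txt'))
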
def _get_one(self, name, *args, **kwargs):
    url = ('http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/'
           'ftp/{name}.zip').format(name=name)
    response = self.session.get(url)
    raw = response.content  # .text returns unicode; .content returns bytes
    if response.status_code != 200:
        raise IOError("Failed to get the data. Check that {0!r} is "
                      "a valid FamaFrench dataset.".format(name))

    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw)
        with ZipFile(tmpf, 'r') as zf:
            data = zf.open(zf.namelist()[0]).readlines()

    line_lengths = np.array(lmap(len, data))
    file_edges = np.where(line_lengths == 2)[0]

    datasets = {}
    edges = zip(file_edges + 1, file_edges[1:])
    for i, (left_edge, right_edge) in enumerate(edges):
        dataset = [d.split() for d in data[left_edge:right_edge]]
        if len(dataset) > 10:
            ncol_raw = np.array(lmap(len, dataset))
            ncol = np.median(ncol_raw)
            header_index = np.where(ncol_raw == ncol - 1)[0][-1]
            header = dataset[header_index]
            ds_header = dataset[header_index + 1:]
            # to ensure the header is unique
            header = ['{0} {1}'.format(j, hj)
                      for j, hj in enumerate(header, start=1)]
            index = np.array([d[0] for d in ds_header], dtype=int)
            dataset = np.array([d[1:] for d in ds_header], dtype=float)
            datasets[i] = pd.DataFrame(dataset, index, columns=header)

    return datasets

def get_data_famafrench(name):
    # path of zip files
    zip_file_url = ('http://mba.tuck.dartmouth.edu/pages/faculty/'
                    'ken.french/ftp/')
    zip_file_path = '{0}{1}.zip'.format(zip_file_url, name)

    with urlopen(zip_file_path) as url:
        raw = url.read()

    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw)
        with ZipFile(tmpf, 'r') as zf:
            data = zf.read(name + '.txt').splitlines()

    line_lengths = np.array([len(d) for d in data])
    file_edges = np.where(line_lengths)[0]

    datasets = {}
    edges = zip(file_edges[:-1], file_edges[1:])  # itertools.izip under Python 2
    for i, (left_edge, right_edge) in enumerate(edges):
        dataset = [d.split() for d in data[left_edge:right_edge]]
        if len(dataset) > 10:
            ncol_raw = np.array([len(d) for d in dataset])
            ncol = np.median(ncol_raw)
            header_index = np.where(ncol_raw == ncol - 1)[0][-1]
            header = dataset[header_index]
            ds_header = dataset[header_index + 1:]
            # to ensure the header is unique
            header = ['{0} {1}'.format(*items)
                      for items in enumerate(header, start=1)]
            index = np.fromiter((d[0] for d in ds_header), dtype=int)
            # np.fromiter only accepts scalars, so build the 2-D body with np.array
            dataset = np.array([d[1:] for d in ds_header], dtype=float)
            datasets[i] = DataFrame(dataset, index, columns=header)

    return datasets

def get_movielens_data(local_file=None, get_ratings=True, get_genres=False,
                       split_genres=True, mdb_mapping=False, get_tags=False,
                       include_time=False):
    '''Downloads movielens data and stores it in pandas dataframe.
    '''
    fields = ['userid', 'movieid', 'rating']

    if include_time:
        fields.append('timestamp')

    if not local_file:
        # downloading data
        from requests import get
        zip_file_url = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
        zip_response = get(zip_file_url)
        zip_contents = BytesIO(zip_response.content)
    else:
        zip_contents = local_file

    ml_data = ml_genres = ml_tags = mapping = None
    # loading data into memory
    with ZipFile(zip_contents) as zfile:
        zip_files = pd.Series(zfile.namelist())
        zip_file = zip_files[zip_files.str.contains('ratings')].iat[0]
        is_new_format = ('latest' in zip_file) or ('20m' in zip_file)
        delimiter = ','
        header = 0 if is_new_format else None
        if get_ratings:
            zdata = zfile.read(zip_file)
            zdata = zdata.replace(b'::', delimiter.encode())
            # makes data compatible with pandas c-engine
            # returns string objects instead of bytes in that case
            ml_data = pd.read_csv(BytesIO(zdata), sep=delimiter, header=header,
                                  engine='c', names=fields, usecols=fields)

        if get_genres:
            zip_file = zip_files[zip_files.str.contains('movies')].iat[0]
            zdata = zfile.read(zip_file)
            if not is_new_format:
                # make data compatible with pandas c-engine
                # pandas returns string objects instead of bytes in that case
                delimiter = '^'
                zdata = zdata.replace(b'::', delimiter.encode())
            genres_data = pd.read_csv(BytesIO(zdata), sep=delimiter,
                                      header=header, engine='c',
                                      encoding='unicode_escape',
                                      names=['movieid', 'movienm', 'genres'])
            ml_genres = get_split_genres(genres_data) if split_genres else genres_data

        if get_tags:
            zip_file = zip_files[zip_files.str.contains('/tags')].iat[0]  # not genome
            zdata = zfile.read(zip_file)
            if not is_new_format:
                # make data compatible with pandas c-engine
                # pandas returns string objects instead of bytes in that case
                delimiter = '^'
                zdata = zdata.replace(b'::', delimiter.encode())
            fields[2] = 'tag'
            ml_tags = pd.read_csv(BytesIO(zdata), sep=delimiter, header=header,
                                  engine='c', encoding='latin1',
                                  names=fields, usecols=range(len(fields)))

        if mdb_mapping and is_new_format:
            # imdb and tmdb mapping - exists only in ml-latest or 20m datasets
            zip_file = zip_files[zip_files.str.contains('links')].iat[0]
            with zfile.open(zip_file) as zdata:
                mapping = pd.read_csv(zdata, sep=',', header=0, engine='c',
                                      names=['movieid', 'imdbid', 'tmdbid'])

    res = [data for data in [ml_data, ml_genres, ml_tags, mapping]
           if data is not None]
    if len(res) == 1:
        res = res[0]
    return res

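# Usage sketch (hedged) for the variant directly above: downloads ml-1m unless
# a local zip is supplied; with ratings and genres both requested the result is
# a two-element list.
def demo_get_movielens_data():
    ratings, genres = get_movielens_data(get_genres=True)
    print(ratings.head())
    print(genres.head())
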
def get_data(geo, resolution, session):
    geo = conv_geo(geo)
    resolution = conv_resol(resolution)

    response = download_data(geo, resolution, session)
    logger.info("Request done")

    logger.info("Create stream file")
    zip_data = compat.BytesIO(response.content)

    logger.info("Creating a DataFrame per symbol")
    d = {}
    cols = None
    with ZipFile(zip_data, 'r') as zf:
        filelist = zf.filelist
        df_info = pd.DataFrame(filelist)
        df_info['filename'] = df_info[0].map(lambda x: x.filename)
        df_info['file_size'] = df_info[0].map(lambda x: x.file_size)
        df_info['date_time'] = df_info[0].map(
            lambda x: datetime.datetime(*x.date_time))
        del df_info[0]
        for zinfo in filelist:
            filename = zinfo.filename
            filename_short, filename_ext = os.path.splitext(filename)
            with zf.open(filename) as zfile:
                if filename_ext.lower() == '.txt':
                    file_exchange = filename.split('/')[3]
                    file_symbol = os.path.split(filename_short)[-1].upper()
                    logger.info(
                        "Building DataFrame for '%s' at '%s' from '%s' (%.1f)"
                        % (file_symbol, file_exchange, filename,
                           float(zinfo.file_size) / 1024))
                    if zinfo.file_size > 0:
                        try:
                            if resolution == 'd':
                                df = pd.read_csv(zfile, parse_dates=0)
                            else:
                                df = pd.read_csv(zfile, parse_dates=[[0, 1]])
                                df = df.rename(columns={'Date_Time': 'Date'})
                            df = df.set_index('Date')
                            df['Exchange'] = file_exchange
                            d[file_symbol] = df
                            if cols is None:
                                cols = df.columns
                        except KeyboardInterrupt:
                            logger.error("CTRL+C was pressed - exit")
                            break
                        except Exception:
                            logger.error(
                                "Can't build DataFrame for '%s' at '%s' from '%s'"
                                % (file_symbol, file_exchange,
                                   filename.replace(' ', '\\ ')))
                            logger.error(traceback.format_exc())
                            d[file_symbol] = None
                    else:
                        logger.error(
                            "Can't build DataFrame for '%s' at '%s' from '%s' (empty file)"
                            % (file_symbol, file_exchange,
                               filename.replace(' ', '\\ ')))
                        d[file_symbol] = None

    logger.info("Create Panel from DataFrame")
    # pd.Panel was removed in pandas 0.25, so this requires an older pandas
    panel = pd.Panel(d)
    panel = panel.transpose(2, 1, 0)
    panel.major_axis = panel.major_axis.map(lambda n: pd.to_datetime(str(n)))

    return (panel, df_info)

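# Usage sketch (hedged): conv_geo, conv_resol, download_data and logger are
# assumed from the surrounding module, and 'us'/'d' are illustrative guesses at
# valid geo/resolution codes.
def demo_get_data():
    import requests
    panel, df_info = get_data('us', 'd', requests.Session())
    print(df_info.head())
    print(panel.items)
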
def read_file(pathName, input_filters_dict, random_state=None):
    ''' Reads the .csv file containing the labeled data images.

    Parameters
    ------------
    pathName : path name of the zip file containing all the training data
    input_filters_dict : ['font', font_name]
        a font_name is a string or tuple containing a list of the fonts to be
        read from the database, or empty to return all fonts, or a string
        containing a single font name, or None, which will return all fonts.
    random_state : None for a random seed chosen by the system, or an integer
        seed for a repeatable shuffle

    Returns
    ------------
    a shuffled pandas DataFrame containing the columns from the csv file

    Note: The file to be read is a .zip file that in turn contains .csv files.
    Each .csv file contains images for a given font. This makes access to a
    font, such as OCRA, fast because only one .csv file needs to be accessed.
    '''
    if not os.path.exists(pathName):
        print('{} does not exist! Downloading it from the web'.format(
            default_zip_file), flush=True)
        downloadFile('http://lyman.house/download/{}'.format(default_zip_file))
        # downloadFile('http://lyman.house/download/fonts_chinese.zip')

    try:
        rd_font = input_filters_dict['font']
        if isinstance(rd_font, str):
            rd_font = (rd_font, )
    except (KeyError, TypeError):
        rd_font = ()

    with ZipFile(pathName, 'r') as myzip:
        if len(rd_font) == 0:
            names = myzip.namelist()
            print('\nreading all files...please wait')
            df = pd.concat(
                apply_column_filters(pd.read_csv(myzip.open(fname, 'r')),
                                     input_filters_dict) for fname in names)
        else:
            try:
                df = pd.concat(
                    apply_column_filters(
                        pd.read_csv(myzip.open(font + ".csv", 'r')),
                        input_filters_dict) for font in rd_font)
            except KeyError:
                raise ValueError(
                    'Could not find font file {} in the zip file'.format(rd_font))

    assert df.size > 0
    return df.sample(frac=1, random_state=random_state)

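# Usage sketch (hedged): pulls a single font from the default database zip and
# shows that a fixed random_state makes the shuffle repeatable; 'OCRA' is an
# illustrative font name.
def demo_read_file():
    df1 = read_file(default_zip_file, {'font': 'OCRA'}, random_state=0)
    df2 = read_file(default_zip_file, {'font': 'OCRA'}, random_state=0)
    assert df1.index.equals(df2.index)
    print(df1.shape)
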
def get_movielens_data(local_file=None, get_ratings=True, get_genres=False,
                       split_genres=True, db_mapping=False):
    '''Downloads movielens data and stores it in pandas dataframe.
    '''
    if not local_file:
        # downloading data
        from requests import get
        zip_file_url = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
        zip_response = get(zip_file_url)
        zip_contents = BytesIO(zip_response.content)
    else:
        zip_contents = local_file

    ml_data = ml_genres = mapping = None
    # loading data into memory
    with ZipFile(zip_contents) as zfile:
        zip_files = pd.Series(zfile.namelist())
        zip_file = zip_files[zip_files.str.contains('ratings')].iat[0]
        is_latest = 'latest' in zip_file
        header = 0 if is_latest else None
        if get_ratings:
            zdata = zfile.read(zip_file)
            delimiter = ','
            # replacing '::' makes the data compatible with the pandas c-engine
            zdata = zdata.replace(b'::', delimiter.encode())
            ml_data = pd.read_csv(BytesIO(zdata), sep=delimiter, header=header,
                                  engine='c',
                                  names=['userid', 'movieid', 'rating', 'timestamp'],
                                  usecols=['userid', 'movieid', 'rating'])

        if get_genres:
            zip_file = zip_files[zip_files.str.contains('movies')].iat[0]
            with zfile.open(zip_file) as zdata:
                delimiter = ',' if is_latest else '::'
                genres_data = pd.read_csv(zdata, sep=delimiter, header=header,
                                          engine='python',
                                          names=['movieid', 'movienm', 'genres'])
            ml_genres = get_split_genres(genres_data) if split_genres else genres_data

        if is_latest and db_mapping:
            # imdb and tmdb mapping - exists only in ml-latest datasets
            zip_file = zip_files[zip_files.str.contains('links')].iat[0]
            with zfile.open(zip_file) as zdata:
                mapping = pd.read_csv(zdata, sep=',', header=0, engine='c',
                                      names=['movieid', 'imdbid', 'tmdbid'])

    res = [data for data in [ml_data, ml_genres, mapping] if data is not None]
    if len(res) == 1:
        res = res[0]
    return res