def test_filename(self):
    """
    Checks ``is_file_string`` against a list of strings it must accept
    and a list of strings it must reject.

    The first offending string is raised as the message of a plain
    ``Exception``.
    """
    fLOG(
        __file__,
        self._testMethodName,
        OutputPrint=__name__ == "__main__")

    # strings which must be recognised as possible file names
    accepted = ("c:\\test", "c:\\test.txt", "..", ".txt", "r.", "r",
                "mqldkfnqmodnsc/\\y")
    # strings which must be refused (they hold a newline, a tab,
    # or a colon in an unexpected position)
    rejected = ("h\ng", "r\tr", "cd:ggd.h")

    for candidate in accepted:
        if is_file_string(candidate):
            continue
        raise Exception(candidate)

    for candidate in rejected:
        if not is_file_string(candidate):
            continue
        raise Exception(candidate)
def test_zip_to_df(self):
    """
    Checks that ``read_csv`` applied to the archive ``data/mynotebooks.zip``
    (located next to this test file) returns a dictionary of dataframes.

    The file ``bank-names.txt`` is filtered out through *fvalid*,
    three entries are then expected, among them ``bank-full.csv``
    which must be a :class:`pandas.DataFrame`.
    """
    fLOG(
        __file__,
        self._testMethodName,
        OutputPrint=__name__ == "__main__")

    dirname = os.path.abspath(os.path.dirname(__file__))
    name = os.path.join(dirname, "data", "mynotebooks.zip")
    self.assertEqual(os.path.exists(name), True,
                     "missing test archive: " + name)
    self.assertEqual(is_file_string(name), True,
                     "not recognised as a file name: " + name)

    dfs = read_csv(name, encoding="utf8",
                   fvalid=lambda n: n != 'bank-names.txt')

    # unittest assertions are used instead of bare ``assert`` statements:
    # they are not removed by ``python -O`` and report the offending type
    self.assertIsInstance(dfs, dict)
    self.assertEqual(len(dfs), 3)
    fLOG(list(dfs))
    full = dfs["bank-full.csv"]
    self.assertIsInstance(full, pandas.DataFrame)
def __init__(self, tick, url="yahoo", folder="cache",
             begin=None, end=None, sep=",", intern=False):
    """
    Loads a stock price from either a url or a folder where the data was cached.
    If a filename ``<folder>/<tick>.<day1>.<day2>.txt`` already exists, it takes it from here.
    Otherwise, it downloads it.

    If url is yahoo, the data will be downloaded using ``http://finance.yahoo.com/q/cp?s=^FCHI+Components``.
    The CAC40 composition is described `here <http://fr.wikipedia.org/wiki/CAC_40>`_.

    @param      tick        tick name, ex ``BNP.PA``, or the name of an existing file
                            which is then loaded with ``pandas.read_csv``
    @param      url         if yahoo, downloads the data from there if it was not done before,
                            ``google``, ``fred``, ``famafrench`` go through ``pandas_datareader``,
                            it can also be a DataFrame with a column ``Date``, it is then used as is
    @param      folder      cache folder (created if it does not exist)
    @param      begin       first day (datetime), see below
    @param      end         last day (datetime), see below
    @param      sep         column separator
    @param      intern      do not use unless you know what to do (see :meth:`__getitem__ <pyensae.finance.astock.StockPrices.__getitem__>`),
                            if True, the data is neither sorted nor indexed by ``Date``

    If begin is None, the date will be 2000/01/03 (it seems Yahoo Finance does not provide
    prices for a date before this one).
    If end is None, the date will be the date of yesterday.

    The constructor raises ``Exception`` if the data cannot be downloaded,
    stored or parsed, and ``StockPricesException`` if the loaded data
    has no column ``Date``.

    .. exref::
        :title: Compute the average returns and correlation matrix

        ::

            import pyensae, pandas
            from pyensae import StockPrices

            # download the CAC 40 composition from my website
            pyensae.download_data('cac40_2013_11_11.txt', website = 'xd')

            # download all the prices (if not already done) and store them into files
            actions = pandas.read_csv("cac40_2013_11_11.txt", sep = "\t")

            # we remove stocks with not enough historical data
            stocks = { k:StockPrices(tick = k) for k,v in actions.values if k != "SOLB.PA"}
            dates = StockPrices.available_dates( stocks.values() )
            stocks = { k:v for k,v in stocks.items() if len(v.missing(dates)) <= 10 }
            print ("nb left", len(stocks))

            # we remove dates with missing prices
            dates = StockPrices.available_dates( stocks.values() )
            ok = dates[ dates["missing"] == 0 ]
            print ("all dates before", len(dates), " after:" , len(ok))
            for k in stocks : stocks[k] = stocks[k].keep_dates(ok)

            # we compute correlation matrix and returns
            ret, cor = StockPrices.covariance(stocks.values(), cov = False, ret = True)

    You should also look at
    `pyensae et notebook <http://www.xavierdupre.fr/blog/notebooks/example%20pyensae.html>`_.
    If you use Google Finance as a provider, the tick name is usually
    prefixed by the market places (NASDAQ for example). The export
    does not work for all markets places.
    """
    if isinstance(url, pandas.DataFrame):
        # the data is given by the user
        self.datadf = url
        self.tickname = tick
        if "Date" not in url.columns:
            # labels are converted into strings, join fails on integers
            raise Exception(
                "the dataframe does not contain any column 'Date': {0}".format(
                    ",".join(str(_) for _ in url.columns)))
    elif isinstance(tick, str) and is_file_string(tick) and os.path.exists(tick):
        # tick is the name of an existing file
        # the other branches define tickname, this one must do it as well
        self.tickname = tick
        # only the first line is needed to detect an HTML page
        with open(tick, "r") as f:
            first_line = f.readline()
        if first_line.startswith('<!DOCTYPE html PUBLIC'):
            raise Exception(
                "pandas cannot parse the file, check your have access to internet: " + str(tick))
        try:
            self.datadf = pandas.read_csv(tick, sep=sep)
        except Exception as e:
            with open(tick, "r") as t:
                content = t.read()
            if "Firewall Authentication" in content:
                raise Exception(
                    "pandas cannot parse the file, check your have access to internet: " + str(tick)) from e
            raise
    else:
        # the data is taken from the cache or downloaded
        if not os.path.exists(folder):
            try:
                os.mkdir(folder)
            except PermissionError as e:
                raise Exception(
                    "PermissionError, unable to create directory " +
                    folder +
                    ", check you execute the program in a folder you have permission to modify (" +
                    os.getcwd() +
                    ")") from e
        self.tickname = tick

        if begin is None:
            begin = datetime.datetime(2000, 1, 3)
        if end is None:
            now = datetime.datetime.now()
            end = now - datetime.timedelta(1)

        sbeg = begin.strftime("%Y-%m-%d")
        send = end.strftime("%Y-%m-%d")
        name = os.path.join(
            folder,
            tick.replace(":", "_") + ".{0}.{1}.txt".format(sbeg, send))

        if not os.path.exists(name):
            if url == "yahoo":
                url = "http://ichart.finance.yahoo.com/table.csv?s=%s&d={0}&e={1}&f={2}&g=d&a={3}&b={4}&c={5}&ignore=.csv".format(
                    end.month - 1, end.day, end.year,
                    begin.month - 1, begin.day, begin.year)
                url = url % tick
                use_url = True
            elif url in ("google", "fred", "famafrench"):
                # "yahoo" was listed here too but the previous test
                # always catches it first
                import pandas_datareader.data as web
                df = web.DataReader(self.tickname, url,
                                    begin, end).reset_index(drop=False)
                df.to_csv(name, sep=sep, index=False)
                use_url = False
            else:
                # str(url): the message must be built even if url is not a string
                raise Exception(
                    "unable to download data from the following website" +
                    str(tick) + " - " + str(url))

            if use_url:
                try:
                    # the context manager closes the connection
                    # even if read fails
                    with urllib.request.urlopen(url) as u:
                        text = u.read()
                except urllib.error.HTTPError as e:
                    raise Exception(
                        "HTTPError, unable to load tick " + tick + "\nURL: " + url) from e

                if len(text) < 10:
                    raise Exception("nothing to download for " + tick +
                                    " less than 10 downloaded bytes")

                try:
                    with open(name, "wb") as f:
                        f.write(text)
                except PermissionError as e:
                    raise Exception(
                        "PermissionError, unable to create directory " +
                        folder +
                        ", check you execute the program in a folder you have permission to modify (" +
                        os.getcwd() +
                        ")") from e

        try:
            self.datadf = pandas.read_csv(name, sep=sep)
        except Exception as e:
            # the cached file (name) is inspected, not the tick name
            # which is not a file in this branch
            with open(name, "r") as t:
                content = t.read()
            if "Firewall Authentication" in content:
                raise Exception(
                    "pandas cannot parse the file, check your have access to internet" + str(tick)) from e
            raise

    if not intern:
        try:
            self.datadf = self.datadf.sort_values("Date")
        except AttributeError:
            # DataFrame without sort_values, falls back on sort
            self.datadf = self.datadf.sort("Date")
        except KeyError as e:
            raise StockPricesException("schema: {}".format(
                ",".join(str(_) for _ in self.datadf.columns))) from e
        self.datadf.reset_index(drop=True, inplace=True)
        self.datadf.set_index("Date", drop=False, inplace=True)
def hive_submit(self, hive_file_or_query, params=None,
                redirection="redirection.hive", no_exception=True,
                fLOG=noLOG):
    """
    Submits a HIVE script or query. If *hive_file_or_query* is an existing
    local file, it is first uploaded (same name, default remote folder)
    and run with ``hive -f``, otherwise the string is considered
    as a query and run with ``hive -e``.

    @param      hive_file_or_query  hive script (local file) or hive query
    @param      params              parameters to send to the job, converted by
                                    ``ASSHClient.build_command_line_parameters`` with prefix ``-hiveconf``
    @param      redirection         string empty or not
    @param      no_exception        sent to @see me execute_command
    @param      fLOG                logging function
    @return                         out, err from @see me execute_command

    If *redirection* is not empty (nor None), the job is submitted but
    the function returns after the standard output and error were
    redirected to ``<redirection>.out`` and ``<redirection>.err``
    (``redirection.hive.out`` and ``redirection.hive.err`` by default).

    The function executes the command line::

        hive -f <filename>

    Or::

        hive -e <query>

    With redirection::

        hive -f <filename> 2> redirection.hive.err 1> redirection.hive.out &

    If there is no redirection, the function waits and return the output.
    The function always emits a warning as the submission is not tested.

    .. exref::
        :title: Submit a HIVE query
        :tag: Hadoop

        ::

            client = ASSHClient()

            hive_sql = '''
                DROP TABLE IF EXISTS bikes20;
                CREATE TABLE bikes20 (sjson STRING);
                LOAD DATA INPATH "/user/__USERNAME__/unittest2/paris*.txt" INTO TABLE bikes20;
                SELECT * FROM bikes20 LIMIT 10;
                '''.replace("__USERNAME__", client.username)

            out,err = client.hive_submit(hive_sql, redirection=None)

    .. versionadded:: 1.1
    """
    # local import, the module is only needed by this method
    import shlex

    if is_file_string(hive_file_or_query) and os.path.exists(hive_file_or_query):
        remote_name = os.path.split(hive_file_or_query)[-1]
        self.upload(hive_file_or_query, remote_name)
        command = "-f"
        # unchanged for usual names, protects names with spaces
        dest = shlex.quote(remote_name)
    else:
        command = "-e"
        query = hive_file_or_query.replace("\n", " ") \
                                  .replace("\r", "") \
                                  .replace("\t", " ")
        # a backslash does not escape a quote inside a single-quoted
        # shell string, shlex.quote produces a valid quoting
        dest = shlex.quote(query.strip())

    if params is not None:
        sparams = ASSHClient.build_command_line_parameters(
            params, "-hiveconf")
        if len(sparams) > 0:
            sparams = " " + sparams
    else:
        sparams = ""

    # an empty redirection means no redirection as documented,
    # it used to redirect to files '.err' and '.out'
    if not redirection:
        cmd = "hive {0} {1}{2}".format(command, dest, sparams)
    else:
        cmd = "hive {0} {1}{2} 2> {3}.err 1> {3}.out &".format(
            command, dest, sparams, redirection)

    warnings.warn("Hive submission is not tested. It will probably fail.")

    fLOG("[hive_submit]:", cmd)
    out, err = self.execute_command(cmd, no_exception=no_exception)
    return out, err
def __init__(self, tick, url="yahoo", folder="cache",
             begin=None, end=None, sep=",", intern=False):
    """
    Loads a stock price from either a url or a folder where the data was cached.
    If a filename ``<folder>/<tick>.<day1>.<day2>.txt`` already exists, it takes it from here.
    Otherwise, it downloads it.

    If url is yahoo, the data will be downloaded using ``http://finance.yahoo.com/q/cp?s=^FCHI+Components``.
    The CAC40 composition is described `here <http://fr.wikipedia.org/wiki/CAC_40>`_.

    @param      tick        tick name, ex ``BNP.PA``, or the name of an existing file
                            which is then loaded with ``pandas.read_csv``
    @param      url         if yahoo, downloads the data from there if it was not done before,
                            ``google``, ``fred``, ``famafrench`` go through ``pandas_datareader``,
                            it can also be a DataFrame with a column ``Date``, it is then used as is
    @param      folder      cache folder (created if it does not exist)
    @param      begin       first day (datetime), see below
    @param      end         last day (datetime), see below
    @param      sep         column separator
    @param      intern      do not use unless you know what to do (see :meth:`__getitem__ <pyensae.finance.astock.StockPrices.__getitem__>`),
                            if True, the data is neither sorted nor indexed by ``Date``

    If begin is None, the date will be 2000/01/03 (it seems Yahoo Finance does not provide
    prices for a date before this one).
    If end is None, the date will be the date of yesterday.

    The constructor raises ``Exception`` if the data cannot be downloaded,
    stored or parsed, and ``StockPricesException`` if the loaded data
    has no column ``Date``.

    .. exref::
        :title: Compute the average returns and correlation matrix

        ::

            import pyensae, pandas
            from pyensae import StockPrices

            # download the CAC 40 composition from my website
            pyensae.download_data('cac40_2013_11_11.txt', website = 'xd')

            # download all the prices (if not already done) and store them into files
            actions = pandas.read_csv("cac40_2013_11_11.txt", sep = "\t")

            # we remove stocks with not enough historical data
            stocks = { k:StockPrices(tick = k) for k,v in actions.values if k != "SOLB.PA"}
            dates = StockPrices.available_dates( stocks.values() )
            stocks = { k:v for k,v in stocks.items() if len(v.missing(dates)) <= 10 }
            print ("nb left", len(stocks))

            # we remove dates with missing prices
            dates = StockPrices.available_dates( stocks.values() )
            ok = dates[ dates["missing"] == 0 ]
            print ("all dates before", len(dates), " after:" , len(ok))
            for k in stocks : stocks[k] = stocks[k].keep_dates(ok)

            # we compute correlation matrix and returns
            ret, cor = StockPrices.covariance(stocks.values(), cov = False, ret = True)

    You should also look at
    `pyensae et notebook <http://www.xavierdupre.fr/blog/notebooks/example%20pyensae.html>`_.
    If you use Google Finance as a provider, the tick name is usually
    prefixed by the market places (NASDAQ for example). The export
    does not work for all markets places.
    """
    if isinstance(url, pandas.DataFrame):
        # the data is given by the user
        self.datadf = url
        self.tickname = tick
        if "Date" not in url.columns:
            # labels are converted into strings, join fails on integers
            raise Exception(
                "the dataframe does not contain any column 'Date': {0}".format(
                    ",".join(str(_) for _ in url.columns)))
    elif isinstance(tick, str) and is_file_string(tick) and os.path.exists(tick):
        # tick is the name of an existing file
        # the other branches define tickname, this one must do it as well
        self.tickname = tick
        # only the first line is needed to detect an HTML page
        with open(tick, "r") as f:
            first_line = f.readline()
        if first_line.startswith('<!DOCTYPE html PUBLIC'):
            raise Exception(
                "pandas cannot parse the file, check your have access to internet: " + str(tick))
        try:
            self.datadf = pandas.read_csv(tick, sep=sep)
        except Exception as e:
            with open(tick, "r") as t:
                content = t.read()
            if "Firewall Authentication" in content:
                raise Exception(
                    "pandas cannot parse the file, check your have access to internet: " + str(tick)) from e
            raise
    else:
        # the data is taken from the cache or downloaded
        if not os.path.exists(folder):
            try:
                os.mkdir(folder)
            except PermissionError as e:
                raise Exception(
                    "PermissionError, unable to create directory " +
                    folder +
                    ", check you execute the program in a folder you have permission to modify (" +
                    os.getcwd() +
                    ")") from e
        self.tickname = tick

        if begin is None:
            begin = datetime.datetime(2000, 1, 3)
        if end is None:
            now = datetime.datetime.now()
            end = now - datetime.timedelta(1)

        sbeg = begin.strftime("%Y-%m-%d")
        send = end.strftime("%Y-%m-%d")
        name = os.path.join(
            folder,
            tick.replace(":", "_") + ".{0}.{1}.txt".format(sbeg, send))

        if not os.path.exists(name):
            if url == "yahoo":
                url = "http://ichart.finance.yahoo.com/table.csv?s=%s&d={0}&e={1}&f={2}&g=d&a={3}&b={4}&c={5}&ignore=.csv".format(
                    end.month - 1, end.day, end.year,
                    begin.month - 1, begin.day, begin.year)
                url = url % tick
                use_url = True
            elif url in ("google", "fred", "famafrench"):
                # "yahoo" was listed here too but the previous test
                # always catches it first
                import pandas_datareader.data as web
                df = web.DataReader(self.tickname, url,
                                    begin, end).reset_index(drop=False)
                df.to_csv(name, sep=sep, index=False)
                use_url = False
            else:
                # str(url): the message must be built even if url is not a string
                raise Exception(
                    "unable to download data from the following website" +
                    str(tick) + " - " + str(url))

            if use_url:
                try:
                    # the context manager closes the connection
                    # even if read fails
                    with urllib.request.urlopen(url) as u:
                        text = u.read()
                except urllib.error.HTTPError as e:
                    raise Exception(
                        "HTTPError, unable to load tick " + tick + "\nURL: " + url) from e

                if len(text) < 10:
                    raise Exception("nothing to download for " + tick +
                                    " less than 10 downloaded bytes")

                try:
                    with open(name, "wb") as f:
                        f.write(text)
                except PermissionError as e:
                    raise Exception(
                        "PermissionError, unable to create directory " +
                        folder +
                        ", check you execute the program in a folder you have permission to modify (" +
                        os.getcwd() +
                        ")") from e

        try:
            self.datadf = pandas.read_csv(name, sep=sep)
        except Exception as e:
            # the cached file (name) is inspected, not the tick name
            # which is not a file in this branch
            with open(name, "r") as t:
                content = t.read()
            if "Firewall Authentication" in content:
                raise Exception(
                    "pandas cannot parse the file, check your have access to internet" + str(tick)) from e
            raise

    if not intern:
        try:
            self.datadf = self.datadf.sort_values("Date")
        except AttributeError:
            # DataFrame without sort_values, falls back on sort
            self.datadf = self.datadf.sort("Date")
        except KeyError as e:
            raise StockPricesException("schema: {}".format(
                ",".join(str(_) for _ in self.datadf.columns))) from e
        self.datadf.reset_index(drop=True, inplace=True)
        self.datadf.set_index("Date", drop=False, inplace=True)