def get_indexes():
    """
    Returns list of supported indexes
    https://eodhistoricaldata.com/knowledgebase/list-supported-indexes/
    """
    data = """ID	Exchange Code	Code	Index Name
1	INDX	GSPC	S&P 500
2	INDX	GDAXI	DAX Index
3	INDX	SSEC	Shanghai Composite Index (China)
4	INDX	MERV	MERVAL Index (Argentina)
5	INDX	FTSE	FTSE 100 Index (UK)
6	INDX	AORD	All Ordinaries Index (Australia)
7	INDX	BSESN	BSE 30 Sensitivity Index (SENSEX)
8	INDX	VIX	VIX S&P 500 Volatility Index
9	INDX	HSI	Hang Seng Index (Hong Kong)
10	INDX	GSPTSE	S&P TSX Composite Index (Canada)
11	INDX	FCHI	CAC 40 Index
12	INDX	TA100	Tel Aviv 100 Index (Israel
13	INDX	CYC	Morgan Stanley Cyclical Index
14	INDX	IIX	Interactive Week Internet Index
15	INDX	CMR	Morgan Stanley Consumer Index
16	INDX	GOX	CBOE Gold Inde
17	INDX	RTS_RS	RTSI Index
18	INDX	GD_AT	Athens Composite Inde
19	INDX	FTSEMIB_MI	Untitled Dataset 2015-07-13 20:00:12
20	INDX	WILREIT	Wilshire US REIT Inde
21	INDX	W5KMCG	Wilshire US Mid Cap Growt
22	INDX	IBEX	IBEX 35 Index
23	INDX	W5KLCV	Wilshire US Large Cap Valu
24	INDX	SSMI	Swiss Market Index
25	INDX	OEX	S&P 100 Inde
26	INDX	RUI	Russell 1000 Inde
27	INDX	XAX	NYSE AMEX Composite Inde
28	INDX	WILRESI	Wilshire US Real Estate Securities Inde
29	INDX	NZ50	NZSE 50 (New Zealand)
30	INDX	UTY	PHLX Utility Sector Inde
31	INDX	CSE	Colombo All Shares Index (Sri Lanka
32	INDX	XOI	NYSE AMEX Oil Inde
33	INDX	OSX	PHLX Oil Service Sector Inde
34	INDX	XAL	NYSE AMEX Airline Inde
35	INDX	W5KSCG	Wilshire US Small Cap Growt
36	INDX	TWII	Taiwan Weighted Inde
37	INDX	ATX	ATX Index (Austria
38	INDX	NWX	NYSE ARCA Networking Inde
39	INDX	W5KSCV	Wilshire US Small Cap Valu
40	INDX	XAU	PHLX Gold/Silver Sector Inde
41	INDX	W5KMCV	Wilshire US Mid Cap Valu
42	INDX	WGREIT	Wilshire Global REIT Inde
43	INDX	SML	S&P Small-Cap 600 Inde
44	INDX	RUT	Russell 2000 Inde
45	INDX	JKSE	Jakarta Composite Index (Indonesia
46	INDX	BFX	Euronext BEL-20 Index (Belgium)
47	INDX	XBD	NYSE AMEX Securities Broker/Dealer Inde
48	INDX	RUA	Russell 3000 Inde
49	INDX	XII	NYSE ARCA Institutional Inde
50	INDX	IETP	ISEQ 20 Price Index (Ireland
51	INDX	DRG	NYSE AMEX Pharmaceutical Inde
52	INDX	W5000	Wilshire 5000 Total Market Inde
53	INDX	HGX	PHLX Housing Sector Inde
54	INDX	MXX	IPC Index (Mexico)
55	INDX	W5KLCG	Wilshire US Large Cap Growt
56	INDX	STI	Straits Times Index
57	INDX	KS11	KOSPI Composite Index
58	INDX	AEX	AEX Amsterdam Index
59	INDX	NYA	NYSE Composite Index
60	INDX	XMI	NYSE ARCA Major Market Inde
61	INDX	BTK	NYSE AMEX Biotechnology Inde
62	INDX	EPX	NASDAQ SIG Oil Exploration and Production Inde
63	INDX	MID	S&P Mid-Cap 400 Inde
64	INDX	HUI	NYSE Arca Gold Bugs Inde
65	INDX	SOX	PHLX Semiconductor Inde
66	INDX	HCX	CBOE S&P Healthcare Index
67	INDX	XCI	NYSE AMEX Computer Technology Inde
68	INDX	XNG	NYSE AMEX Natural Gas Inde
69	INDX	RMZ	MSCI US REIT Inde
70	INDX	WGRESI	Wilshire Global Real Estate Securities Inde
71	INDX	N225	Nikkei 225 Index (Japan
72	INDX	VDAX	Deutsche Boerse VDAX Volatility Inde
73	INDX	MXY	NYSE ARCA Mexico Inde
74	INDX	OSEAX	Oslo Exchange All Share Index (Norway)
75	INDX	TYX	Treasury Yield 30 Years Inde
76	INDX	DJI	Dow Jones Industrial Average
77	INDX	AXPJ	S&P/ASX 200 Australia REIT Inde
78	INDX	PSI20	PSI 20 Stock Index (Portugal
79	INDX	IRX	13-week Treasury Bill Inde
80	INDX	FVX	Treasury Yield 5 Years Inde
81	INDX	NYI	NYSE International 100 Index
82	INDX	AXJO	S&P/ASX 200 Index (Australia
83	INDX	512NTR	S&P 500 GBP Hdg (Net TR) (^512NTR)
84	INDX	CTES_VI	Czech Trading Inde
85	INDX	NSEI	S&P/CNX Nifty Index (India
86	INDX	NYY	NYSE TMT Inde
87	INDX	CCSI	EGX 70 Price Index (Egypt
88	INDX	SPSUPX	S&P Composite 1500 Inde
89	INDX	BVSP	Bovespa Index (Brazil)
90	INDX	ISEQ	ISEQ Overall Price Index (Ireland
91	INDX	JPN	NYSE AMEX Japan Inde
92	INDX	NYL	NYSE World Leaders Inde
93	INDX	TNX	CBOE Interest Rate 10-Year T-Note Inde
94	INDX	NY	NYSE US 100 Inde
95	INDX	SPLV	PowerShares S&P 500 Low Volatil
96	INDX	OMXSPI	Stockholm General Index (Sweden)
97	INDX	GVZ	CBOE Gold Volatility Inde
98	INDX	SPY	SPDR S&P 500 (SPY
99	INDX	IEQR_IR	ISEQ General Total Return Index (Ireland
100	INDX	OMXC20_CO	OMX Copenhagen 20 Index
101	INDX	DJUSFN	^DJUSFN: Dow Jones U.S. Financials Inde
102	INDX	DJASD	^DJASD: Dow Jones Asia Select Dividen
103	INDX	IMUS	^IMUS: Dow Jones Islamic Market U.S.
104	INDX	W1SGI	^W1SGI: Dow Jones Sustainability Worl
105	INDX	DJT	^DJT: Dow Jones Transportation Averag
106	INDX	DJUSM	^DJUSM: Dow Jones U.S. Mid-Cap Inde
107	INDX	W1XGA	^W1XGA: Dow Jones Sustainability Worl
108	INDX	DWC	^DWC: DJUS Market Index (full-cap
109	INDX	DJC	^DJC: Dow Jones-UBS Commodity Inde
110	INDX	IMXL	^IMXL: Dow Jones Islamic Market Titan
111	INDX	XLHK	^XLHK: Dow Jones Hong Kong Titans 30
112	INDX	DJTMDI	^DJTMDI: Dow Jones Media Titans 30 Inde
113	INDX	DJU	^DJU: Dow Jones Utility Averag
114	INDX	DWCOGS	^DWCOGS: Dow Jones U.S. Oil & Gas Tota
115	INDX	DJUSST	^DJUSST: Dow Jones U.S. Iron & Steel In
116	INDX	PSE	^PSE: NYSE Arca Tech 100 Index - New York Stock Exchange
117	INDX	DWCF	^DWCF: Dow Jones U.S. Total Stock Mar
118	INDX	W1SUS	^W1SUS: Dow Jones Sustainability Worl
119	INDX	DJASDT	^DJASDT: Dow Jones Asia Select Dividen
120	INDX	RCI	^RCI: Dow Jones Composite All REIT I
121	INDX	DJUSL	^DJUSL: Dow Jones U.S. Large-Cap Inde
122	INDX	P1DOW	^P1DOW: Dow Jones Asia/Pacific Inde
123	INDX	DJAT	^DJAT: Dow Jones Asian Titans 50 Inde
124	INDX	DJUS	^DJUS: Dow Jones U.S. Inde
125	INDX	DWMI	^DWMI: Dow Jones U.S. Micro-Cap Tota
126	INDX	DJUSS	^DJUSS: Dow Jones U.S. Small-Cap Inde
127	INDX	OMX	OMXS 30 Index (Sweden
128	INDX	STOXX50E	EuroStoxx 50 Inde
129	INDX	FTAS	FTSE All-Share Index (UK)
130	INDX	WIHUN_L	FTSE HUngary Index
131	INDX	WITUR_L	FTSE Turkey Index
132	INDX	WITHA_L	FTSE Thailand Index
133	INDX	WIPOL_L	FTSE Poland Index
134	INDX	WICZH_L	FTSE Czech Republic Index
135	INDX	OMXC20	OMX Copenhagen 20 Inde
136	INDX	IXE	^IXE: Select Sector Spdr-energy Inde
137	INDX	IXIC	NASDAQ Composite
138	INDX	SPEUP	S&P EUROPE 350"""
    df = pd.read_csv(StringIO(data), sep="\t")
    df = df.set_index("ID")
    return (df)
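`get_indexes` returns the table as a DataFrame keyed by the numeric `ID` column, so callers can look rows up by ID or filter by ticker code. A brief usage sketch (the lookups shown here are illustrative, not part of the original module):

```python
indexes = get_indexes()

# Look up a single index by its numeric ID.
print(indexes.loc[1, "Index Name"])          # "S&P 500"

# Filter the table by ticker code.
print(indexes[indexes["Code"] == "GDAXI"])
```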
def test_header_multi_index(self):
    expected = tm.makeCustomDataframe(
        5, 3, r_idx_nlevels=2, c_idx_nlevels=4)

    data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2
C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""

    df = self.read_csv(StringIO(data), header=[0, 1, 2, 3],
                       index_col=[0, 1])
    tm.assert_frame_equal(df, expected)

    # skipping lines in the header
    df = self.read_csv(StringIO(data), header=[0, 1, 2, 3],
                       index_col=[0, 1])
    tm.assert_frame_equal(df, expected)

    # INVALID OPTIONS

    # no as_recarray
    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        pytest.raises(ValueError, self.read_csv,
                      StringIO(data), header=[0, 1, 2, 3],
                      index_col=[0, 1], as_recarray=True)

    # names
    pytest.raises(ValueError, self.read_csv,
                  StringIO(data), header=[0, 1, 2, 3],
                  index_col=[0, 1], names=['foo', 'bar'])

    # usecols
    pytest.raises(ValueError, self.read_csv,
                  StringIO(data), header=[0, 1, 2, 3],
                  index_col=[0, 1], usecols=['foo', 'bar'])

    # non-numeric index_col
    pytest.raises(ValueError, self.read_csv,
                  StringIO(data), header=[0, 1, 2, 3],
                  index_col=['foo', 'bar'])
def _test(text, **kwargs):
    nice_text = text.replace('\r', '\r\n')
    result = TextReader(StringIO(text), **kwargs).read()
    expected = TextReader(StringIO(nice_text), **kwargs).read()
    assert_array_dicts_equal(result, expected)
def _get_handle(path_or_buf, mode, encoding=None, compression=None,
                memory_map=False, is_text=True):
    """
    Get file handle for given path/buffer and mode.

    Parameters
    ----------
    path_or_buf : a path (str) or buffer
    mode : str
        mode to open path_or_buf with
    encoding : str or None
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None
        If 'infer' and `filepath_or_buffer` is path-like, then detect
        compression from the following extensions: '.gz', '.bz2', '.zip',
        or '.xz' (otherwise no compression).
    memory_map : boolean, default False
        See parsers._parser_params for more information.
    is_text : boolean, default True
        whether file/buffer is in text format (csv, json, etc.), or in binary
        mode (pickle, etc.)

    Returns
    -------
    f : file-like
        A file-like object
    handles : list of file-like objects
        A list of file-like object that were opened in this function.
    """
    try:
        from s3fs import S3File
        need_text_wrapping = (BytesIO, S3File)
    except ImportError:
        need_text_wrapping = (BytesIO,)

    handles = list()
    f = path_or_buf

    # Convert pathlib.Path/py.path.local or string
    path_or_buf = _stringify_path(path_or_buf)
    is_path = isinstance(path_or_buf, compat.string_types)

    if is_path:
        compression = _infer_compression(path_or_buf, compression)

    if compression:

        if compat.PY2 and not is_path and encoding:
            msg = 'compression with encoding is not yet supported in Python 2'
            raise ValueError(msg)

        # GZ Compression
        if compression == 'gzip':
            if is_path:
                f = gzip.open(path_or_buf, mode)
            else:
                f = gzip.GzipFile(fileobj=path_or_buf)

        # BZ Compression
        elif compression == 'bz2':
            if is_path:
                f = bz2.BZ2File(path_or_buf, mode)
            elif compat.PY2:
                # Python 2's bz2 module can't take file objects, so have to
                # run through decompress manually
                f = StringIO(bz2.decompress(path_or_buf.read()))
                path_or_buf.close()
            else:
                f = bz2.BZ2File(path_or_buf)

        # ZIP Compression
        elif compression == 'zip':
            zf = BytesZipFile(path_or_buf, mode)
            # Ensure the container is closed as well.
            handles.append(zf)
            if zf.mode == 'w':
                f = zf
            elif zf.mode == 'r':
                zip_names = zf.namelist()
                if len(zip_names) == 1:
                    f = zf.open(zip_names.pop())
                elif len(zip_names) == 0:
                    raise ValueError('Zero files found in ZIP file {}'
                                     .format(path_or_buf))
                else:
                    raise ValueError('Multiple files found in ZIP file.'
                                     ' Only one file per ZIP: {}'
                                     .format(zip_names))

        # XZ Compression
        elif compression == 'xz':
            f = lzma.LZMAFile(path_or_buf, mode)

        # Unrecognized Compression
        else:
            msg = 'Unrecognized compression type: {}'.format(compression)
            raise ValueError(msg)

        handles.append(f)

    elif is_path:
        if compat.PY2:
            # Python 2
            mode = "wb" if mode == "w" else mode
            f = open(path_or_buf, mode)
        elif encoding:
            # Python 3 and encoding
            f = open(path_or_buf, mode, encoding=encoding, newline="")
        elif is_text:
            # Python 3 and no explicit encoding
            f = open(path_or_buf, mode, errors='replace', newline="")
        else:
            # Python 3 and binary mode
            f = open(path_or_buf, mode)
        handles.append(f)

    # in Python 3, convert BytesIO or fileobjects passed with an encoding
    if (compat.PY3 and is_text and
            (compression or isinstance(f, need_text_wrapping))):
        from io import TextIOWrapper
        f = TextIOWrapper(f, encoding=encoding, newline='')
        handles.append(f)

    if memory_map and hasattr(f, 'fileno'):
        try:
            g = MMapWrapper(f)
            f.close()
            f = g
        except Exception:
            # we catch any errors that may have occurred
            # because that is consistent with the lower-level
            # functionality of the C engine (pd.read_csv), so
            # leave the file handler as is then
            pass

    return f, handles
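As the docstring notes, `_get_handle` both opens the target and records every wrapper it creates in `handles`, so the caller is expected to close everything in that list when done. A minimal usage sketch, assuming the surrounding pandas-internal helpers (`compat`, `_stringify_path`, `_infer_compression`, `BytesZipFile`, `MMapWrapper`) are available in the same module; the file name is made up for illustration:

```python
# Hypothetical usage: read a gzip-compressed CSV through _get_handle.
f, handles = _get_handle('example.csv.gz', 'r',
                         encoding='utf-8', compression='infer')
try:
    df = pd.read_csv(f)
finally:
    # Close the wrappers in reverse order of creation: the TextIOWrapper
    # (if any) first, then the underlying gzip/file handle.
    for h in reversed(handles):
        h.close()
```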
def test_read_with_bad_header(self):
    errmsg = r"but only \d+ lines in file"

    with tm.assert_raises_regex(ValueError, errmsg):
        s = StringIO(',,')
        self.read_csv(s, header=[10])
def _get_handle(path_or_buf, mode, encoding=None, compression=None,
                memory_map=False, is_text=True):
    """
    Get file handle for given path/buffer and mode.

    Parameters
    ----------
    path_or_buf : a path (str) or buffer
    mode : str
        mode to open path_or_buf with
    encoding : str or None
    compression : str or None
        Supported compression protocols are gzip, bz2, zip, and xz
    memory_map : boolean, default False
        See parsers._parser_params for more information.
    is_text : boolean, default True
        whether file/buffer is in text format (csv, json, etc.), or in binary
        mode (pickle, etc.)

    Returns
    -------
    f : file-like
        A file-like object
    handles : list of file-like objects
        A list of file-like object that were opened in this function.
    """
    # NOTE: `need_text_wrapping` (used below) is defined at module level in
    # the original source: (BytesIO,), plus S3File when s3fs is installed.
    handles = list()
    f = path_or_buf
    is_path = isinstance(path_or_buf, compat.string_types)

    if compression:

        if compat.PY2 and not is_path and encoding:
            msg = 'compression with encoding is not yet supported in Python 2'
            raise ValueError(msg)

        # GZ Compression
        if compression == 'gzip':
            import gzip
            if is_path:
                f = gzip.open(path_or_buf, mode)
            else:
                f = gzip.GzipFile(fileobj=path_or_buf)

        # BZ Compression
        elif compression == 'bz2':
            import bz2
            if is_path:
                f = bz2.BZ2File(path_or_buf, mode)
            elif compat.PY2:
                # Python 2's bz2 module can't take file objects, so have to
                # run through decompress manually
                f = StringIO(bz2.decompress(path_or_buf.read()))
                path_or_buf.close()
            else:
                f = bz2.BZ2File(path_or_buf)

        # ZIP Compression
        elif compression == 'zip':
            import zipfile
            zip_file = zipfile.ZipFile(path_or_buf)
            zip_names = zip_file.namelist()

            if len(zip_names) == 1:
                f = zip_file.open(zip_names.pop())
            elif len(zip_names) == 0:
                raise ValueError('Zero files found in ZIP file {}'
                                 .format(path_or_buf))
            else:
                raise ValueError('Multiple files found in ZIP file.'
                                 ' Only one file per ZIP: {}'
                                 .format(zip_names))

        # XZ Compression
        elif compression == 'xz':
            lzma = compat.import_lzma()
            f = lzma.LZMAFile(path_or_buf, mode)

        # Unrecognized Compression
        else:
            msg = 'Unrecognized compression type: {}'.format(compression)
            raise ValueError(msg)

        handles.append(f)

    elif is_path:
        if compat.PY2:
            # Python 2
            f = open(path_or_buf, mode)
        elif encoding:
            # Python 3 and encoding
            f = open(path_or_buf, mode, encoding=encoding)
        elif is_text:
            # Python 3 and no explicit encoding
            f = open(path_or_buf, mode, errors='replace')
        else:
            # Python 3 and binary mode
            f = open(path_or_buf, mode)
        handles.append(f)

    # in Python 3, convert BytesIO or fileobjects passed with an encoding
    if (compat.PY3 and is_text and
            (compression or isinstance(f, need_text_wrapping))):
        from io import TextIOWrapper
        f = TextIOWrapper(f, encoding=encoding)
        handles.append(f)

    if memory_map and hasattr(f, 'fileno'):
        try:
            g = MMapWrapper(f)
            f.close()
            f = g
        except Exception:
            # we catch any errors that may have occurred
            # because that is consistent with the lower-level
            # functionality of the C engine (pd.read_csv), so
            # leave the file handler as is then
            pass

    return f, handles
def bdi(itype='D', retry_count=3, pause=0.001):
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(ct.BDI_URL % (ct.P_TYPE['http'],
                                            ct.DOMAINS['v500']))
            lines = urlopen(request, timeout=10).read()
            if len(lines) < 100:  # no data
                return None
        except Exception as e:
            print(e)
        else:
            linestr = lines.decode('utf-8') if ct.PY3 else lines
            if itype == 'D':  # Daily
                reg = re.compile(r'\"chart_data\",\"(.*?)\"\);')
                lines = reg.findall(linestr)
                lines = lines[0]
                lines = lines.replace('chart', 'table').\
                    replace('</series><graphs>', '').\
                    replace('</graphs>', '').\
                    replace('series', 'tr').\
                    replace('value', 'td').\
                    replace('graph', 'tr').\
                    replace('graphs', 'td')
                df = pd.read_html(lines, encoding='utf8')[0]
                df = df.T
                df.columns = ['date', 'index']
                df['date'] = df['date'].map(lambda x: x.replace(u'年', '-')).\
                    map(lambda x: x.replace(u'月', '-')).\
                    map(lambda x: x.replace(u'日', ''))
                df['date'] = pd.to_datetime(df['date'])
                df['index'] = df['index'].astype(float)
                df = df.sort_values('date',
                                    ascending=False).reset_index(drop=True)
                df['change'] = df['index'].pct_change(-1)
                df['change'] = df['change'] * 100
                df['change'] = df['change'].map(lambda x: '%.2f' % x)
                df['change'] = df['change'].astype(float)
                return df
            else:  # Weekly
                html = lxml.html.parse(StringIO(linestr))
                res = html.xpath(
                    "//table[@class=\"style33\"]/tr/td/table[last()]")
                if ct.PY3:
                    sarr = [etree.tostring(node).decode('utf-8')
                            for node in res]
                else:
                    sarr = [etree.tostring(node) for node in res]
                sarr = ''.join(sarr)
                sarr = '<table>%s</table>' % sarr
                df = pd.read_html(sarr)[0][1:]
                df.columns = ['month', 'index']
                df['month'] = df['month'].map(lambda x: x.replace(u'年', '-')).\
                    map(lambda x: x.replace(u'月', ''))
                df['month'] = pd.to_datetime(df['month'])
                df['month'] = df['month'].map(lambda x: str(x).replace('-', '')).\
                    map(lambda x: x[:6])
                df['index'] = df['index'].astype(float)
                df['change'] = df['index'].pct_change(-1)
                df['change'] = df['change'].map(lambda x: '%.2f' % x)
                df['change'] = df['change'].astype(float)
                return df
def test_pass_offset_warn(self):
    buf = StringIO()

    sys.stderr = buf
    DatetimeIndex(start='1/1/2000', periods=10, offset='H')
    sys.stderr = sys.__stderr__
def test_value_counts_datetime64(self):
    klasses = [Index, Series]
    for klass in klasses:
        # GH 3002, datetime64[ns]
        # don't test names though
        txt = "\n".join(['xxyyzz20100101PIE', 'xxyyzz20100101GUM',
                         'xxyyzz20100101EGG', 'xxyyww20090101EGG',
                         'foofoo20080909PIE', 'foofoo20080909GUM'])
        f = StringIO(txt)
        df = pd.read_fwf(f, widths=[6, 8, 3],
                         names=["person_id", "dt", "food"],
                         parse_dates=["dt"])

        s = klass(df['dt'].copy())
        s.name = None

        idx = pd.to_datetime(['2010-01-01 00:00:00Z',
                              '2008-09-09 00:00:00Z',
                              '2009-01-01 00:00:00X'])
        expected_s = Series([3, 2, 1], index=idx)
        tm.assert_series_equal(s.value_counts(), expected_s)

        expected = np_array_datetime64_compat(['2010-01-01 00:00:00Z',
                                               '2009-01-01 00:00:00Z',
                                               '2008-09-09 00:00:00Z'],
                                              dtype='datetime64[ns]')
        if isinstance(s, Index):
            tm.assert_index_equal(s.unique(), DatetimeIndex(expected))
        else:
            tm.assert_numpy_array_equal(s.unique(), expected)

        assert s.nunique() == 3

        # with NaT
        s = df['dt'].copy()
        s = klass([v for v in s.values] + [pd.NaT])

        result = s.value_counts()
        assert result.index.dtype == 'datetime64[ns]'
        tm.assert_series_equal(result, expected_s)

        result = s.value_counts(dropna=False)
        expected_s[pd.NaT] = 1
        tm.assert_series_equal(result, expected_s)

        unique = s.unique()
        assert unique.dtype == 'datetime64[ns]'

        # numpy_array_equal cannot compare pd.NaT
        if isinstance(s, Index):
            exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT])
            tm.assert_index_equal(unique, exp_idx)
        else:
            tm.assert_numpy_array_equal(unique[:3], expected)
            assert pd.isna(unique[3])

        assert s.nunique() == 3
        assert s.nunique(dropna=False) == 4

        # timedelta64[ns]
        td = df.dt - df.dt + timedelta(1)
        td = klass(td, name='dt')

        result = td.value_counts()
        expected_s = Series([6], index=[Timedelta('1day')], name='dt')
        tm.assert_series_equal(result, expected_s)

        expected = TimedeltaIndex(['1 days'], name='dt')
        if isinstance(td, Index):
            tm.assert_index_equal(td.unique(), expected)
        else:
            tm.assert_numpy_array_equal(td.unique(), expected.values)

        td2 = timedelta(1) + (df.dt - df.dt)
        td2 = klass(td2, name='dt')
        result2 = td2.value_counts()
        tm.assert_series_equal(result2, expected_s)
def test_get_filepath_or_buffer_with_buffer(self):
    input_buffer = StringIO()
    filepath_or_buffer, _, _ = common.get_filepath_or_buffer(input_buffer)
    assert filepath_or_buffer == input_buffer
import dash
import dash_html_components as html
import pandas as pd
from pandas.compat import StringIO
from webviz_components import (Layout, Map, Page)
from os import path

path_to_data = path.join(path.dirname(__file__), "reek.csv")

cells = pd.read_csv(StringIO("""
i,j,k,x0,y0,x1,y1,x2,y2,x3,y3,value,FLOWI+,FLOWJ+
0,0,0,0,0,1,0,1,1,0,1,1,0.005,0.0025
1,0,0,1,0,2,0,2,1,1,1,0,0.002,0.0045
0,1,0,0,1,1,1,1,2,0,2,4,0.001,0.0025
1,1,0,1,1,2,1,2,2,1,2,2,0.004,0.0035
"""))

app = dash.Dash(__name__, external_stylesheets=['assets/theme.css'])

server = app.server

app.css.config.serve_locally = True
app.scripts.config.serve_locally = True

app.layout = Layout(children=[
    Page(id='reek',
         title='Map with Reek data',
         children=[
             html.H1(children='Reek'),
             html.P(children='''
                 This is an example of Map using Reek data
def _lexer_split_from_str(dt_str):
    # The StringIO(str(_)) is for dateutil 2.2 compatibility
    return _timelex.split(StringIO(str(dt_str)))
def read_bucket_csv(gcs_path):
    file_stream = file_io.FileIO(gcs_path, mode='r')
    data = pd.read_csv(StringIO(file_stream.read()),
                       delimiter=';', header=None)
    return data
def test_get_filepath_or_buffer_with_buffer(self):
    input_buffer = StringIO()
    filepath_or_buffer, _, _, should_close = common.get_filepath_or_buffer(
        input_buffer)
    assert filepath_or_buffer == input_buffer
    assert not should_close
def test_readjson_chunksize_requires_lines(lines_json_df):
    msg = "chunksize can only be passed if lines=True"
    with pytest.raises(ValueError, match=msg):
        pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2)
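The test above pins down the constraint that `chunksize` is only honoured together with `lines=True`. A small sketch of the valid combination, using made-up JSON-lines data in place of the test suite's `lines_json_df` fixture:

```python
import pandas as pd
from pandas.compat import StringIO

# Hypothetical JSON-lines payload: one record per line.
payload = '{"a": 1, "b": 2}\n{"a": 3, "b": 4}\n{"a": 5, "b": 6}\n'

# With lines=True, chunksize turns read_json into an iterator of DataFrames.
reader = pd.read_json(StringIO(payload), lines=True, chunksize=2)
for chunk in reader:
    print(chunk.shape)  # (2, 2) then (1, 2)
```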
def test_converters_type_must_be_dict(self):
    data = """index,A,B,C,D
foo,2,3,4,5
"""
    with tm.assert_raises_regex(TypeError, 'Type converters.+'):
        self.read_csv(StringIO(data), converters=0)
# Fragment from a loop over query sequences: run BLAST for one identifier,
# parse the tab-separated hits, and report progress (the enclosing loop and
# the setup of n, xx, dld, sin_resultados and out_text are not shown).
###
blast = subprocess.call([yyyyy, '-db', zzzzz, '-query', identifier,
                         '-evalue', uuuuu, '-outfmt',
                         "6 qacc sacc qlen slen length score bitscore evalue pident nident mismatch positive gaps gapopen stitle",
                         '-max_target_seqs', '10', '-max_hsps', '1',
                         '-out', identifier + '.txt'])
###
out = open(identifier + '.txt', 'r')
out = out.read()
n += 1
if len(out) == 0:
    out_bytes = len(out)
    sin_resultados.append('Secuencia ' + str(n) + ' | ' + str(identifier))
    os.remove(identifier)
    os.remove(identifier + '.txt')
    continue
else:
    out_text.append(pd.read_csv(StringIO(out), sep='\t', header=None))
    out_bytes = len(out)
###
dif = max([i for i in range(1, out_bytes + 100, 100)]) - out_bytes
total_length = int(out_bytes)
dld += out_bytes
dl = 0
for dat in [i for i in range(1, out_bytes + 100, 100)]:
    tim = datetime.now() - xx
    dl = dat - dif
    done = int(30 * dl / total_length)
    sys.stdout.write('\rSecuencia ' + str(n) + ' | %s | %s' %
                     ('{}'.format(tim).split('.')[0], identifier))
    sys.stdout.flush()
os.remove(identifier)
os.remove(identifier + '.txt')
###
def test_converters_corner_with_nas(self):
    # skip aberration observed on Win64 Python 3.2.2
    if hash(np.int64(-1)) != -2:
        pytest.skip("skipping because of windows hash on Python"
                    " 3.2.2")

    data = """id,score,days
1,2,12
2,2-5,
3,,14+
4,6-12,2"""

    def convert_days(x):
        x = x.strip()
        if not x:
            return np.nan

        is_plus = x.endswith('+')
        if is_plus:
            x = int(x[:-1]) + 1
        else:
            x = int(x)
        return x

    def convert_days_sentinel(x):
        x = x.strip()
        if not x:
            return np.nan

        is_plus = x.endswith('+')
        if is_plus:
            x = int(x[:-1]) + 1
        else:
            x = int(x)
        return x

    def convert_score(x):
        x = x.strip()
        if not x:
            return np.nan
        if x.find('-') > 0:
            valmin, valmax = lmap(int, x.split('-'))
            val = 0.5 * (valmin + valmax)
        else:
            val = float(x)
        return val

    fh = StringIO(data)
    result = self.read_csv(fh, converters={'score': convert_score,
                                           'days': convert_days},
                           na_values=['', None])
    assert pd.isna(result['days'][1])

    fh = StringIO(data)
    result2 = self.read_csv(fh, converters={'score': convert_score,
                                            'days': convert_days_sentinel},
                            na_values=['', None])
    tm.assert_frame_equal(result, result2)
def test_value_counts_inferred(self):
    klasses = [Index, Series]
    for klass in klasses:
        s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
        s = klass(s_values)
        expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
        tm.assert_series_equal(s.value_counts(), expected)

        self.assert_numpy_array_equal(s.unique(), np.unique(s_values))
        self.assertEqual(s.nunique(), 4)

        # don't sort, have to sort after the fact as not sorting is
        # platform-dep
        hist = s.value_counts(sort=False).sort_values()
        expected = Series([3, 1, 4, 2], index=list('acbd')).sort_values()
        tm.assert_series_equal(hist, expected)

        # sort ascending
        hist = s.value_counts(ascending=True)
        expected = Series([1, 2, 3, 4], index=list('cdab'))
        tm.assert_series_equal(hist, expected)

        # relative histogram.
        hist = s.value_counts(normalize=True)
        expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
        tm.assert_series_equal(hist, expected)

        # bins
        self.assertRaises(TypeError,
                          lambda bins: s.value_counts(bins=bins), 1)

        s1 = Series([1, 1, 2, 3])
        res1 = s1.value_counts(bins=1)
        exp1 = Series({0.998: 4})
        tm.assert_series_equal(res1, exp1)
        res1n = s1.value_counts(bins=1, normalize=True)
        exp1n = Series({0.998: 1.0})
        tm.assert_series_equal(res1n, exp1n)

        self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3]))
        self.assertEqual(s1.nunique(), 3)

        res4 = s1.value_counts(bins=4)
        exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1},
                      index=[0.998, 2.5, 1.5, 2.0])
        tm.assert_series_equal(res4, exp4)
        res4n = s1.value_counts(bins=4, normalize=True)
        exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25},
                       index=[0.998, 2.5, 1.5, 2.0])
        tm.assert_series_equal(res4n, exp4n)

        # handle NA's properly
        s_values = ['a', 'b', 'b', 'b', np.nan, np.nan,
                    'd', 'd', 'a', 'a', 'b']
        s = klass(s_values)
        expected = Series([4, 3, 2], index=['b', 'a', 'd'])
        tm.assert_series_equal(s.value_counts(), expected)

        self.assert_numpy_array_equal(
            s.unique(), np.array(['a', 'b', np.nan, 'd'], dtype='O'))
        self.assertEqual(s.nunique(), 3)

        s = klass({})
        expected = Series([], dtype=np.int64)
        tm.assert_series_equal(s.value_counts(), expected)
        self.assert_numpy_array_equal(s.unique(), np.array([]))
        self.assertEqual(s.nunique(), 0)

        # GH 3002, datetime64[ns]
        # don't test names though
        txt = "\n".join(['xxyyzz20100101PIE', 'xxyyzz20100101GUM',
                         'xxyyzz20100101EGG', 'xxyyww20090101EGG',
                         'foofoo20080909PIE', 'foofoo20080909GUM'])
        f = StringIO(txt)
        df = pd.read_fwf(f, widths=[6, 8, 3],
                         names=["person_id", "dt", "food"],
                         parse_dates=["dt"])

        s = klass(df['dt'].copy())
        s.name = None

        idx = pd.to_datetime(['2010-01-01 00:00:00Z',
                              '2008-09-09 00:00:00Z',
                              '2009-01-01 00:00:00X'])
        expected_s = Series([3, 2, 1], index=idx)
        tm.assert_series_equal(s.value_counts(), expected_s)

        expected = np.array(['2010-01-01 00:00:00Z',
                             '2009-01-01 00:00:00Z',
                             '2008-09-09 00:00:00Z'],
                            dtype='datetime64[ns]')
        if isinstance(s, DatetimeIndex):
            expected = DatetimeIndex(expected)
            self.assertTrue(s.unique().equals(expected))
        else:
            self.assert_numpy_array_equal(s.unique(), expected)

        self.assertEqual(s.nunique(), 3)

        # with NaT
        s = df['dt'].copy()
        s = klass([v for v in s.values] + [pd.NaT])

        result = s.value_counts()
        self.assertEqual(result.index.dtype, 'datetime64[ns]')
        tm.assert_series_equal(result, expected_s)

        result = s.value_counts(dropna=False)
        expected_s[pd.NaT] = 1
        tm.assert_series_equal(result, expected_s)

        unique = s.unique()
        self.assertEqual(unique.dtype, 'datetime64[ns]')

        # numpy_array_equal cannot compare pd.NaT
        self.assert_numpy_array_equal(unique[:3], expected)
        self.assertTrue(unique[3] is pd.NaT or
                        unique[3].astype('int64') == pd.tslib.iNaT)

        self.assertEqual(s.nunique(), 3)
        self.assertEqual(s.nunique(dropna=False), 4)

        # timedelta64[ns]
        td = df.dt - df.dt + timedelta(1)
        td = klass(td, name='dt')

        result = td.value_counts()
        expected_s = Series([6], index=[Timedelta('1day')], name='dt')
        tm.assert_series_equal(result, expected_s)

        expected = TimedeltaIndex(['1 days'])
        if isinstance(td, TimedeltaIndex):
            self.assertTrue(td.unique().equals(expected))
        else:
            self.assert_numpy_array_equal(td.unique(), expected.values)

        td2 = timedelta(1) + (df.dt - df.dt)
        td2 = klass(td2, name='dt')
        result2 = td2.value_counts()

        tm.assert_series_equal(result2, expected_s)
def test_get_filepath_or_buffer_with_buffer(self):
    input_buffer = StringIO()
    filepath_or_buffer, _, _ = common.get_filepath_or_buffer(input_buffer)
    self.assertEqual(filepath_or_buffer, input_buffer)
""" Alice Lepissier [email protected] July 2018 Risk-based IFF Scrape the BIS website for data """ import requests, os import pandas as pd from pandas.compat import StringIO os.chdir( 'C:/cloudstorage/googledrive/Projects/Tax Justice Network/Consultancy 2 - summer 18/Risk-based IFF' ) gets = pd.read_csv('Data/LBS/LBS_GET.csv', header=None) urls = gets[0].tolist() df = pd.DataFrame() for url in urls: req = requests.get(url) req_string = req.text data = pd.read_csv(StringIO(req_string), sep=',', skiprows=6) df = df.append(data, sort=False) df.to_csv('Data/LBS/LBS.csv')
def lhb_detail(code=None, date=None, retry_count=3, pause=0.001):
    """
    Fetch per-stock "dragon-tiger list" (top trading seats) detail data.

    Parameters
    --------
    code : str
        stock code
    date : str
        date
    retry_count : int, default 3
        number of retries when network problems occur
    pause : int, default 0
        seconds to pause between repeated requests, to avoid problems
        caused by requesting too frequently

    Return
    ------
    tuple(DataFrame1, DataFrame2)
        DataFrame
            code: stock code
            date: date
            broker: brokerage branch name
            count: number of appearances on the list
            probability: probability of a rise after buying
            buy: buy amount (10k)
            buy_prop: buy amount as a share of total turnover
            sell: sell amount (10k)
            sell_prop: sell amount as a share of total turnover
            net: net amount (10k)
            buysellflag: buy/sell flag
    """
    df1 = None
    df2 = None
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_DETAIL % (date, code))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//tbody")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr.pop(0)  # discard the useless first element
            sarr = ''.join(sarr)
            sarr = sarr.replace('tbody', 'table')  # normalize the markup
            # there are two <table> elements, so read_html returns two
            # DataFrames
            list_sarr = pd.read_html(sarr)
            for i in range(len(list_sarr)):
                df = list_sarr[i]  # process this DataFrame
                # drop the summary row at the end of the sell table
                if df.iloc[:, 0].size == 6:  # keep only 5 data rows
                    df.drop([5], inplace=True, axis=0)
                # drop the useless sequence-number column
                df.drop([0], inplace=True, axis=1)
                # split the broker column into three columns
                df[1] = df[1].map(lambda x: str(x).split("  "))  # two spaces inside the quotes
                try:
                    # split the original column into three new Series
                    ser1 = df[1].map(lambda x: x[0])
                    ser2 = df[1].map(lambda x: x[1])
                    ser3 = df[1].map(lambda x: x[2])
                except Exception as e:
                    pass
                df.drop(1, inplace=True, axis=1)  # drop the original column
                df.insert(0, 'broker', ser1)
                df.insert(1, 'count', ser2)
                df.insert(2, 'per', ser3)
                # prepend the code and date columns
                df.insert(0, 'code', str(code))
                df.insert(1, 'date', str(date))
                # append the buy/sell flag at the end
                if i == 0:
                    df.insert(len(df.columns), 'buysellflag', 'buy')
                elif i == 1:
                    df.insert(len(df.columns), 'buysellflag', 'sell')
                # if the number of columns matches, apply the header
                if len(df.columns) == 11:
                    df.columns = rv.LHB_DETAIL_COLS
                elif len(df.columns) == 10:
                    # only 10 columns means there were no buying/selling
                    # brokers; insert a placeholder column
                    df.insert(5, 'None', None)
                    df.columns = rv.LHB_DETAIL_COLS
                    df.loc[0] = [code, date, '没有买入或卖出机构', 'None',
                                 'None', 'None', 'None', 'None', 'None',
                                 'None', 'None']
                # assign
                if i == 0:
                    df1 = df
                elif i == 1:
                    df2 = df
            list1 = [df1, df2]
            return pd.concat(list1)
        except Exception as e:
            print(e)
def test_header_multiindex_common_format(self):

    df = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
                   index=['one', 'two'],
                   columns=MultiIndex.from_tuples(
                       [('a', 'q'), ('a', 'r'), ('a', 's'),
                        ('b', 't'), ('c', 'u'), ('c', 'v')]))

    # to_csv
    data = """,a,a,a,b,c,c
,q,r,s,t,u,v
,,,,,,
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""

    result = self.read_csv(StringIO(data), header=[0, 1], index_col=0)
    tm.assert_frame_equal(df, result)

    # common
    data = """,a,a,a,b,c,c
,q,r,s,t,u,v
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""

    result = self.read_csv(StringIO(data), header=[0, 1], index_col=0)
    tm.assert_frame_equal(df, result)

    # common, no index_col
    data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

    result = self.read_csv(StringIO(data), header=[0, 1], index_col=None)
    tm.assert_frame_equal(df.reset_index(drop=True), result)

    # malformed case 1
    expected = DataFrame(np.array(
        [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype='int64'),
        index=Index([1, 7]),
        columns=MultiIndex(levels=[[u('a'), u('b'), u('c')],
                                   [u('r'), u('s'), u('t'),
                                    u('u'), u('v')]],
                           labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
                           names=[u('a'), u('q')]))

    data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

    result = self.read_csv(StringIO(data), header=[0, 1], index_col=0)
    tm.assert_frame_equal(expected, result)

    # malformed case 2
    expected = DataFrame(np.array(
        [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype='int64'),
        index=Index([1, 7]),
        columns=MultiIndex(levels=[[u('a'), u('b'), u('c')],
                                   [u('r'), u('s'), u('t'),
                                    u('u'), u('v')]],
                           labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
                           names=[None, u('q')]))

    data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

    result = self.read_csv(StringIO(data), header=[0, 1], index_col=0)
    tm.assert_frame_equal(expected, result)

    # mi on columns and index (malformed)
    expected = DataFrame(np.array(
        [[3, 4, 5, 6], [9, 10, 11, 12]], dtype='int64'),
        index=MultiIndex(levels=[[1, 7], [2, 8]],
                         labels=[[0, 1], [0, 1]]),
        columns=MultiIndex(levels=[[u('a'), u('b'), u('c')],
                                   [u('s'), u('t'), u('u'), u('v')]],
                           labels=[[0, 1, 2, 2], [0, 1, 2, 3]],
                           names=[None, u('q')]))

    data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

    result = self.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1])
    tm.assert_frame_equal(expected, result)
def test_info_memory_usage(self):
    # Ensure memory usage is displayed, when asserted, on the last line
    dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
              'complex128', 'object', 'bool']
    data = {}
    n = 10
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.randint(2, size=n).astype(dtype)
    df = DataFrame(data)
    buf = StringIO()

    # display memory usage case
    df.info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()
    assert "memory usage: " in res[-1]

    # do not display memory usage case
    df.info(buf=buf, memory_usage=False)
    res = buf.getvalue().splitlines()
    assert "memory usage: " not in res[-1]

    df.info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()

    # memory usage is a lower bound, so print it as XYZ+ MB
    assert re.match(r"memory usage: [^+]+\+", res[-1])

    df.iloc[:, :5].info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()

    # excluded column with object dtype, so estimate is accurate
    assert not re.match(r"memory usage: [^+]+\+", res[-1])

    # Test a DataFrame with duplicate columns
    dtypes = ['int64', 'int64', 'int64', 'float64']
    data = {}
    n = 100
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.randint(2, size=n).astype(dtype)
    df = DataFrame(data)
    df.columns = dtypes

    df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
    df_with_object_index.info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()
    assert re.match(r"memory usage: [^+]+\+", res[-1])

    df_with_object_index.info(buf=buf, memory_usage='deep')
    res = buf.getvalue().splitlines()
    assert re.match(r"memory usage: [^+]+$", res[-1])

    # Ensure df size is as expected
    # (cols * rows * bytes) + index size
    df_size = df.memory_usage().sum()
    exp_size = len(dtypes) * n * 8 + df.index.nbytes
    assert df_size == exp_size

    # Ensure number of cols in memory_usage is the same as df
    size_df = np.size(df.columns.values) + 1  # index=True; default
    assert size_df == np.size(df.memory_usage())

    # assert deep works only on object
    assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()

    # test for validity
    DataFrame(1, index=['a'], columns=['A']).memory_usage(index=True)
    DataFrame(1, index=['a'], columns=['A']).index.nbytes

    df = DataFrame(
        data=1,
        index=pd.MultiIndex.from_product([['a'], range(1000)]),
        columns=['A'])
    df.index.nbytes
    df.memory_usage(index=True)
    df.index.values.nbytes

    mem = df.memory_usage(deep=True).sum()
    assert mem > 0
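The comments in the test above explain why `df.info()` prints a trailing `+` after the memory figure: for `object` columns the default estimate counts only the array of pointers and is therefore a lower bound, while `memory_usage='deep'` (or `deep=True` on `DataFrame.memory_usage`) also sizes the referenced Python objects. A small illustrative sketch; the column values are made up, not taken from the test:

```python
import pandas as pd

df = pd.DataFrame({"word": ["alpha", "beta", "gamma"]})

shallow = df.memory_usage(index=False).sum()           # pointer array only
deep = df.memory_usage(index=False, deep=True).sum()   # includes the strings

# For an object column the deep figure is larger, which is why
# df.info(memory_usage=True) reports the shallow number with a trailing '+'.
assert deep > shallow
```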
def test_singleton_header(self):
    # See GH #7757
    data = """a,b,c\n0,1,2\n1,2,3"""
    df = self.read_csv(StringIO(data), header=[0])
    expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]})
    tm.assert_frame_equal(df, expected)
def test_repr_mixed(self):
    buf = StringIO()

    # mixed
    foo = repr(self.mixed_frame)  # noqa
    self.mixed_frame.info(verbose=False, buf=buf)
def _make_reader(**kwds):
    return TextReader(StringIO(data), delimiter=',', **kwds)
def test_readjson_invalid_chunksize(lines_json_df, chunksize):
    msg = r"'chunksize' must be an integer >=1"

    with pytest.raises(ValueError, match=msg):
        pd.read_json(StringIO(lines_json_df), lines=True,
                     chunksize=chunksize)
def test_string_factorize(self):
    # should this be optional?
    data = 'a\nb\na\nb\na'
    reader = TextReader(StringIO(data), header=None)
    result = reader.read()
    assert len(set(map(id, result[0]))) == 2
    'max': [6],
    'min': [1]
}

obs_mock_data = {
    'index': ['02-03-2006'],
    'name': ['line-1'],
    'value': [4],
    'error': [2]
}

obs_csv = pd.DataFrame(pd.read_csv(StringIO("""
index,name,value,error
2012-01-04,line-3,17,2
2012-01-08,line-2,8,3
""")))

obs_without_index = pd.read_csv(StringIO("""
name,value,error
line-3,17,2
line-2,8,2
"""))


class TestFanChart(unittest.TestCase):

    def test_using_csv(self):
        self.assertTrue(validate_observation_data(obs_csv))

        try: