def fetchQuotes(sym, start=FROM_DATE, end=CURRENT_DATE):
    his = None
    daily = None  # initialised so the return below cannot raise NameError when the fetch fails
    data = None
    try:
        # print start, end
        data = ystockquote.get_historical_prices(sym, start, end)
    except Exception:
        # 404 because the data is not yet available
        print "Please check the dates. Data might not be available. 404 returned"

    if data:
        his = DataFrame(collections.OrderedDict(sorted(data.items()))).T
        his = his.convert_objects(convert_numeric=True)
        his.index = pd.to_datetime(his.index)
        his.insert(0, 'symbol', sym, allow_duplicates=True)
        # insert the date as a column too
        his.insert(1, 'date', his.index)
        # his.columns = getColumns('stock_quote_historical')  # removed along with the db dependency
        his.columns = getColumnsNoSql('stock_quote_historical')
        daily = ystockquote.get_all(sym)
        # print daily
        # persist(his, daily, sym, end)
    return his, daily
def homePageToSubjectPageDataframe(data):
    subject_dataframe = DataFrame(data, columns=['date', 'page_title', 'views', 'uniqueViews'])
    subject_dataframe = subject_dataframe.convert_objects(convert_numeric=True)
    subject_dataframe.drop('date', axis=1, inplace=True)
    subject_dataframe = subject_dataframe.groupby(['page_title']).sum().sort(['uniqueViews'], ascending=0)
    subject_dataframe.reset_index(inplace=True)
    subject_dataframe['subject'] = subject_dataframe['page_title'].apply(lambda title: strip_edx_page_title(title))
    subject_dataframe['totalViews'] = subject_dataframe['uniqueViews'].sum()
    subject_dataframe['Pct'] = subject_dataframe['uniqueViews'] / subject_dataframe['totalViews']
    # keep only subjects accounting for more than 0.01% of unique views
    subject_dataframe = subject_dataframe[subject_dataframe['Pct'] > 0.0001]
    return subject_dataframe[['subject', 'uniqueViews', 'Pct']]
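# All of these snippets call DataFrame.convert_objects, which was deprecated in
# pandas 0.17 in favour of the to_numeric/to_datetime/to_timedelta functions and
# removed in later releases. A minimal sketch of the usual replacement for
# convert_objects(convert_numeric=True), assuming pandas >= 0.17; the helper
# name is ours, not part of the original code, and it only roughly matches the
# old behaviour:
import pandas as pd

def coerce_numeric(frame):
    # Best-effort per-column conversion: errors='ignore' leaves a column
    # untouched when it cannot be parsed as numbers.
    return frame.apply(pd.to_numeric, errors='ignore')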
def get_hosts():
    csv_file = "./hosts.csv"
    print "Hosts input file: %s" % csv_file
    hosts_raw = pd.read_csv(csv_file, sep=',',
                            names=['#', 'Provider', 'Region', 'CPUs', 'Memory',
                                   'Storage', 'DiskType', 'OS', 'Cost', 'Comment'])
    hosts = DataFrame(hosts_raw)
    hosts = hosts[1:]  # drop the header row that was read in as data
    hosts = hosts.convert_objects(convert_numeric=True)
    print hosts
    return hosts
def enrollmentDataframe(enrolls_data, card_type, enroll_type):
    enrolls_dataframe = DataFrame(
        enrolls_data,
        columns=[
            'date',
            'cardName',
            'position',
            'total{type}Enrolls'.format(type=enroll_type),
            'unique{type}Enrolls'.format(type=enroll_type)
        ]
    )
    enrolls_dataframe = enrolls_dataframe.convert_objects(convert_numeric=True)
    enrolls_dataframe.drop('date', axis=1, inplace=True)
    enrolls_dataframe['type'] = card_type
    return enrolls_dataframe
# Fragment of a larger per-patient loop; the enclosing for-loops and the
# encounter_dates/systolic_bps/diastolic_bps/std_bp/count set-up are not shown.
value_quant = observation.find('{http://hl7.org/fhir}valueQuantity')
value = value_quant.find('{http://hl7.org/fhir}value')
diastolic_bps.append(value.get('value'))

if not systolic_bps and not diastolic_bps:
    # print "No Systolic/Diastolic BP"
    continue
if (len(encounter_dates) != len(systolic_bps)
        or len(systolic_bps) != len(diastolic_bps)):
    continue

enc_dict = {}
enc_dict['encounter_date'] = encounter_dates
enc_dict['systolic_bp'] = systolic_bps
enc_dict['diastolic_bp'] = diastolic_bps
encounters = DataFrame(enc_dict,
                       columns=['encounter_date', 'diastolic_bp', 'systolic_bp'])
encounters = encounters.convert_objects(convert_dates='coerce',
                                        convert_numeric=True)
# print encounters
# encounters.is_copy = False

# Keep encounters between 2004 and 2009 inclusive.
enc_period = encounters[(encounters.encounter_date.dt.year >= 2004) &
                        (encounters.encounter_date.dt.year <= 2009)]
if enc_period.empty:
    # print "No data in the given period"
    continue

# Assigning to a column of this slice triggers SettingWithCopyWarning;
# the original sidestepped it (see the commented-out is_copy line above).
enc_period['mean_bp'] = enc_period['diastolic_bp']  # + ((enc_period['systolic_bp'] - enc_period['diastolic_bp']) / 3)
mbp = enc_period['mean_bp'].mean()
sbp = enc_period['mean_bp'].std()
if math.isnan(sbp):
    continue
std_bp.append(sbp)
count = count + 1
def _data_to_frame(data, header, index_col, infer_types, skiprows):
    """Parse a BeautifulSoup table into a DataFrame.

    Parameters
    ----------
    data : tuple of lists
        The raw data to be placed into a DataFrame. This is a list of lists
        of strings or unicode. If it helps, it can be thought of as a matrix
        of strings instead.
    header : int or None
        An integer indicating the row to use for the column header or None
        indicating no header will be used.
    index_col : int or None
        An integer indicating the column to use for the index or None
        indicating no column will be used.
    infer_types : bool
        Whether to convert numbers and dates.
    skiprows : collections.Container or int or slice
        Iterable used to skip rows.

    Returns
    -------
    df : DataFrame
        A DataFrame containing the data from `data`.

    Raises
    ------
    ValueError
        * If `skiprows` is not found in the rows of the parsed DataFrame.

    See Also
    --------
    read_html

    Notes
    -----
    The `data` parameter is guaranteed not to be a list of empty lists.
    """
    thead, tbody, tfoot = data
    columns = thead or None
    df = DataFrame(tbody, columns=columns)

    if skiprows is not None:
        it = _get_skiprows_iter(skiprows)
        try:
            df = df.drop(it)
        except ValueError:
            raise ValueError('Labels {0} not found when trying to skip'
                             ' rows'.format(it))

    # convert to numbers/dates where possible
    # must be sequential since dates trump numbers if both args are given
    if infer_types:
        df = df.convert_objects(convert_numeric=True)
        df = df.convert_objects(convert_dates='coerce')

    if header is not None:
        header_rows = df.iloc[header]
        if header_rows.ndim == 2:
            names = header_rows.index
            df.columns = MultiIndex.from_arrays(header_rows.values,
                                                names=names)
        else:
            df.columns = header_rows
        df = df.drop(df.index[header])

    if index_col is not None:
        cols = df.columns[index_col]
        try:
            cols = cols.tolist()
        except AttributeError:
            pass

        # drop by default
        df.set_index(cols, inplace=True)
        if df.index.nlevels == 1:
            if isnull(df.index.name) or not df.index.name:
                df.index.name = None
        else:
            names = [name or None for name in df.index.names]
            df.index = MultiIndex.from_tuples(df.index.values, names=names)

    return df
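# In modern pandas the two sequential convert_objects calls above (numbers
# first, then dates, since "dates trump numbers") map roughly onto to_numeric
# and to_datetime. A sketch under that assumption, not pandas' actual current
# read_html code:
import pandas as pd

def infer_frame_types(df):
    for col in df.columns:
        numeric = pd.to_numeric(df[col], errors='coerce')
        if numeric.notnull().any():
            # Treat the column as numeric if anything parsed.
            df[col] = numeric
        else:
            # Otherwise fall back to dates, coercing failures to NaT.
            df[col] = pd.to_datetime(df[col], errors='coerce')
    return df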
def Get_dataframe(Symbol):
    Symbolx = str(Symbol) + '_history'
    # MongoClient() with no arguments connects to the default localhost:27017
    Db_cursor = MongoClient()['stox'][Symbolx].find()
    x = DataFrame(list(Db_cursor))
    x = x.convert_objects(convert_numeric=True)
    return x
def clicksDataframe(clicks_data):
    clicks_dataframe = DataFrame(clicks_data,
                                 columns=['date', 'cardName', 'position',
                                          'totalClicks', 'uniqueClicks'])
    clicks_dataframe = clicks_dataframe.convert_objects(convert_numeric=True)
    clicks_dataframe.drop('date', axis=1, inplace=True)
    return clicks_dataframe
def get_company_fundamental_fnguide(code):
    def g(x):
        if type(x) == str:
            return datetime.datetime.strptime(x, '%Y-%m-%d')
        else:
            return x

    # url = "http://comp.fnguide.com/SVO2/ASP/SVD_main.asp?pGB=1&gicode=A%s&cID=&MenuYn=Y&ReportGB=&NewMenuID=11&stkGb=&strResearchYN=" % (code)
    url = ("http://asp01.fnguide.com/SVO2/ASP/SVD_Main.asp"
           "?pGB=1&gicode=A%s&NewMenuID=11&cID=50&MenuYn=N" % (code))
    respstr = get_webpage(url, encoding="utf8")
    # soup = BeautifulSoup(respstr)
    soup = BeautifulSoup(respstr, "lxml")

    # IFRS separate / annual ("IFRS 별도/연간")
    target_table = soup.find("div", class_="um_table", id="highlight_B_Y")
    result = []
    try:
        target_table.find_all('tr')
    except Exception as e:
        # target_table is None when the page has no annual highlight table
        return (DataFrame(), DataFrame())

    for tr in target_table.find_all('tr'):
        for th in tr.find_all('th'):
            # Strip the provisional/estimate markers (and their Korean
            # equivalents), then normalise 'YYYY/MM' to 'YYYY-MM'.
            value = "%s" % th.text.replace('(P) : Provisional', '') \
                                  .replace('(E) : Estimate', '') \
                                  .replace('잠정실적', '') \
                                  .replace('컨센서스, 추정치', '') \
                                  .replace('(E)', '') \
                                  .replace('(P)', '') \
                                  .replace('/', '-').strip()
            # Pad the month to its last day.
            if '-02' in value:
                value = value + '-28'
            elif ('-04' in value) or ('-06' in value) or ('-09' in value) or ('-11' in value):
                value = value + '-30'
            elif ('-01' in value) or ('-03' in value) or ('-05' in value) or \
                 ('-07' in value) or ('-08' in value) or ('-10' in value) or ('-12' in value):
                value = value + '-31'
            result.append(value)
        for td in tr.find_all('td'):
            value = td.text.strip().replace(',', '')
            try:
                value = float(value)
            except Exception as e:
                value = 0
            result.append(value)

    # Drop the leading header cell, then reshape into rows of 9 periods.
    result = result[1:]
    dfdata = []
    for x in range(0, len(result), 9):
        dfdata.append(result[x:x + 9])
    df = DataFrame(data=dfdata, columns=[str(x) for x in range(1, 10)]).T
    # Korean column names: date, revenue, operating profit, net income,
    # total assets, total liabilities, total equity, capital stock,
    # debt ratio, retention ratio, operating margin, net margin, ROA, ROE,
    # EPS, BPS, DPS, PER, PBR, shares outstanding, dividend yield.
    df.columns = [
        '날짜', '매출액', '영업이익', '당기순이익', '자산총계', '부채총계', '자본총계',
        '자본금', '부채비율', '유보율', '영업이익률', '순이익률', 'ROA', 'ROE',
        'EPS', 'BPS', 'DPS', 'PER', 'PBR', '발행주식수', '배당수익률'
    ]
    df.drop(df.index[[0]], inplace=True)
    # df['날짜'] = df['date'].apply(g)
    # df.drop(['date'], axis=1, inplace=True)
    df = df.convert_objects(convert_numeric=True)
    # df.set_index('날짜', inplace=True)
    df_year = df

    # IFRS separate / quarterly ("IFRS 별도/분기"); same parsing as above.
    target_table = soup.find("div", class_="um_table", id="highlight_B_Q")
    result = []
    for tr in target_table.find_all('tr'):
        for th in tr.find_all('th'):
            value = "%s" % th.text.replace('(P) : Provisional', '') \
                                  .replace('(E) : Estimate', '') \
                                  .replace('잠정실적', '') \
                                  .replace('컨센서스, 추정치', '') \
                                  .replace('(E)', '') \
                                  .replace('(P)', '') \
                                  .replace('/', '-').strip()
            if '-02' in value:
                value = value + '-28'
            elif ('-04' in value) or ('-06' in value) or ('-09' in value) or ('-11' in value):
                value = value + '-30'
            elif ('-01' in value) or ('-03' in value) or ('-05' in value) or \
                 ('-07' in value) or ('-08' in value) or ('-10' in value) or ('-12' in value):
                value = value + '-31'
            result.append(value)
        for td in tr.find_all('td'):
            value = td.text.strip().replace(',', '')
            try:
                value = float(value)
            except Exception as e:
                value = 0
            result.append(value)

    result = result[1:]
    dfdata = []
    for x in range(0, len(result), 9):
        dfdata.append(result[x:x + 9])
    df = DataFrame(data=dfdata, columns=[str(x) for x in range(1, 10)]).T
    df.columns = [
        '날짜', '매출액', '영업이익', '당기순이익', '자산총계', '부채총계', '자본총계',
        '자본금', '부채비율', '유보율', '영업이익률', '순이익률', 'ROA', 'ROE',
        'EPS', 'BPS', 'DPS', 'PER', 'PBR', '발행주식수', '배당수익률'
    ]
    df.drop(df.index[[0]], inplace=True)
    # df['날짜'] = df['date'].apply(g)
    # df.drop(['date'], axis=1, inplace=True)
    df = df.convert_objects(convert_numeric=True)
    # df.set_index('날짜', inplace=True)
    df_qtr = df

    return (df_year, df_qtr)
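# The month-to-last-day padding above is written out twice as a long if/elif
# chain. A hypothetical helper (not in the original) that does the same thing,
# assuming the header text reduces to a 'YYYY-MM' string; unlike the literal
# '-28' branch, calendar.monthrange also gets leap-year February right:
import calendar

def pad_to_month_end(value):
    year, month = value.split('-')[:2]
    last_day = calendar.monthrange(int(year), int(month))[1]
    return '%s-%s-%02d' % (year, month, last_day)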
# for table_set in p.tables:
#     for table in table_set[1:]:
#         scoring_probability = float(table[7]) * float(table[-1])
#         if scoring_probability > 15.0:
#             a = '%s %f | %s %f' % (table[1], scoring_probability, table[1], scoring_probability)
#             print(a)

with open('table_example.html', encoding='utf-8') as file_:
    s = file_.read()

p = HTMLTableParser()
p.feed(s)

table_set = array([table[1:] for table_group in p.tables for table in table_group])
# The first row holds the column names; without this line df is undefined below.
df = DataFrame(table_set[1:], columns=table_set[0])
df = df.convert_objects(convert_numeric=True)
df['ZS'] = 9

# Czech column names: 'Tým' = team, 'Jméno' = name, 'Z' presumably games (zápasy).
for team, indexes in df.groupby('Tým').groups.items():
    team_data_set = df.loc[indexes, ['Z', 'ZS']]
    print((team_data_set['ZS'] / team_data_set['Z']).sum())

# for team in df.get('Tým').unique():
#     print(team)
# multiplication_result = df.get('S/Z') * df.get('RÚS')
# result_df = DataFrame({'Name': df.get('Jméno'), 'Team': df.get('Tým'),
#                        'Probability': multiplication_result})
# for team, indexes in result_df.groupby('Team').groups.items():
#     print(result_df.loc[indexes, ['Name', 'Probability']].set_index('Name').to_dict()['Probability'])
# for table_set in p.tables:
#     for table in table_set[1:]:
#         scoring_probability = float(table[7]) * float(table[-1])
#         if scoring_probability > 50.0:
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
# sklearn.cross_validation was renamed; in scikit-learn >= 0.18 this function
# lives in sklearn.model_selection.
from sklearn.cross_validation import train_test_split

# Import data from csv, group into columns for ease of use
import csv
import pandas as pd
from pandas import DataFrame as df

# f = open('auto-mpg.data.csv', 'rb')
df = pd.read_csv('auto-mpg.data.csv')
# print df.head
df = df.convert_objects(convert_numeric=True)
# df.info()

y = df.mpg.astype(float)
x = df.horsepower.astype(float)
# nan_to_num returns a new array rather than working in place, so its result
# must be assigned back; the original discarded it.
x = np.nan_to_num(x.values).reshape(-1, 1)
y = np.nan_to_num(y.values)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
# print(y_train.shape)
# print(x_train.shape)

# Train the Linear Regression model and plot a prediction
lr = LinearRegression()
lr.fit(x_train, y_train)
df['resp'] = 1  # adds a new column

# add the next trial
thisTrial = trials.next()
df = df.append(thisTrial, ignore_index=True)  # ignore_index because the trial carries no index
df['resp'][1] = 0
print(df)

# add the next trial
thisTrial = trials.next()
df = df.append(thisTrial, ignore_index=True)
df['resp'][2] = 1
print(df)

# Use pandas to calculate the proportion correct at each level. The dtypes
# here come out as "object", which has no mean, so convert them to numeric first.
df = df.convert_objects(convert_numeric=True)
# print('df='); print(df)  # debug

grouped = df.groupby('tilt')
groupMeans = grouped.mean()  # a DataFrame indexed by the tilt values
tiltsTested = list(groupMeans.index)
pResp = list(groupMeans['resp'])
ns = grouped.sum()  # want n per tilt to scale the data point sizes
ns = list(ns['resp'])
print('df mean at each tilt\n')
print(DataFrame({'tilt': tiltsTested, 'pResp': pResp, 'n': ns}))

# data point sizes: one entry in the array for each datapoint
# def plotDataAndPsychometricCurve(staircase, fit, descendingPsycho, threshVal):
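# The chained assignments above (df['resp'][1] = 0) can silently write to a
# copy and draw SettingWithCopyWarning in later pandas. A small self-contained
# sketch (hypothetical data, not from the original) of the .loc equivalent:
import pandas as pd

demo = pd.DataFrame({'tilt': [0.4, 0.4, 0.8], 'resp': [1, 1, 1]})
demo.loc[1, 'resp'] = 0  # single label-based assignment, no chained indexing
print(demo.groupby('tilt')['resp'].mean())  # proportion correct per tilt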
all_players.append(player_info)

# Column headers for player dataframe
player_col_header = ['player_id', 'last_name', 'first_name', 'country']

# Create a dataframe for keeping track of player info;
# every player starts with an elo rating of 1500
players = DataFrame(all_players, columns=player_col_header)
players['current_elo'] = Series(1500, index=players.index)
players['last_tourney_date'] = Series('N/A', index=players.index)
players['matches_played'] = Series(0, index=players.index)
players['peak_elo'] = Series(1500, index=players.index)
players['peak_elo_date'] = Series('N/A', index=players.index)

# Convert objects within dataframe to numeric
players = players.convert_objects(convert_numeric=True)

# Create an empty dataframe to store time-series elo for the top 10 players
# based on peak elo rating, using player_id as the column header.
# The top ten are: Djokovic, Federer, McEnroe, Nadal, Borg, Lendl, Becker,
# Murray, Sampras, Connors.
elo_timeseries_col_header = [104925, 103819, 100581, 104745, 100437,
                             100656, 101414, 104918, 101948, 100284]
elo_timeseries = DataFrame(columns=elo_timeseries_col_header)

# Read through the matches file for each year, starting from current_year,
# to update the players dataframe
current_year = 1968
for i in range((2015 - 1968) + 1):
    current_year_file_name = 'atp_matches_' + str(current_year) + '.csv'
    # read match CSV file and store important columns into lists
def totalHomePageViewsValue(data):
    homepage_view_dataframe = DataFrame(data, columns=['date', 'views', 'uniqueViews'])
    homepage_view_dataframe = homepage_view_dataframe.convert_objects(convert_numeric=True)
    return int(homepage_view_dataframe['uniqueViews'].sum())