def fetchQuotes(sym, start=FROM_DATE, end=CURRENT_DATE):
    his = None
    daily = None  # initialised so the return below cannot raise NameError when the fetch fails
    data = None
    try:
        # print start, end
        data = ystockquote.get_historical_prices(sym, start, end)
    except Exception:
        # 404 because the data is not yet available
        print "Please check the dates. Data might not be available. 404 returned"

    if data:
        his = DataFrame(collections.OrderedDict(sorted(data.items()))).T
        his = his.convert_objects(convert_numeric=True)
        his.index = pd.to_datetime(his.index)
        his.insert(0, 'symbol', sym, allow_duplicates=True)
        # insert the date as a column too
        his.insert(1, 'date', his.index)
        # his.columns = getColumns('stock_quote_historical')  # removed along with the db dependency
        his.columns = getColumnsNoSql('stock_quote_historical')
        daily = ystockquote.get_all(sym)
        # print daily
        # persist(his, daily, sym, end)
    return his, daily
def homePageToSubjectPageDataframe(data):
    subject_dataframe = DataFrame(data, columns=['date', 'page_title', 'views', 'uniqueViews'])
    subject_dataframe = subject_dataframe.convert_objects(convert_numeric=True)
    subject_dataframe.drop('date', axis=1, inplace=True)
    subject_dataframe = subject_dataframe.groupby(['page_title']).sum().sort(['uniqueViews'], ascending=0)
    subject_dataframe.reset_index(inplace=True)
    subject_dataframe['subject'] = subject_dataframe['page_title'].apply(lambda title: strip_edx_page_title(title))
    subject_dataframe['totalViews'] = subject_dataframe['uniqueViews'].sum()
    subject_dataframe['Pct'] = subject_dataframe['uniqueViews'] / subject_dataframe['totalViews']
    # keep only subjects accounting for more than 0.01% of unique views
    subject_dataframe = subject_dataframe[subject_dataframe['Pct'] > 0.0001]
    return subject_dataframe[['subject', 'uniqueViews', 'Pct']]
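# All of these snippets call DataFrame.convert_objects, which was deprecated in
# pandas 0.17 in favour of the to_numeric/to_datetime/to_timedelta functions and
# removed in later releases. A minimal sketch of the usual replacement for
# convert_objects(convert_numeric=True), assuming pandas >= 0.17; the helper
# name is ours, not part of the original code, and it only roughly matches the
# old behaviour:
import pandas as pd

def coerce_numeric(frame):
    # Best-effort per-column conversion: errors='ignore' leaves a column
    # untouched when it cannot be parsed as numbers.
    return frame.apply(pd.to_numeric, errors='ignore')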
def get_hosts():
    csv_file = "./hosts.csv"
    print "Hosts input file: %s" % csv_file
    hosts_raw = pd.read_csv(csv_file, sep=',',
                            names=['#', 'Provider', 'Region', 'CPUs', 'Memory',
                                   'Storage', 'DiskType', 'OS', 'Cost', 'Comment'])
    hosts = DataFrame(hosts_raw)
    hosts = hosts[1:]  # drop the header row that was read in as data
    hosts = hosts.convert_objects(convert_numeric=True)
    print hosts
    return hosts
def enrollmentDataframe(enrolls_data, card_type, enroll_type):
    enrolls_dataframe = DataFrame(
        enrolls_data,
        columns=[
            'date',
            'cardName',
            'position',
            'total{type}Enrolls'.format(type=enroll_type),
            'unique{type}Enrolls'.format(type=enroll_type)
        ]
    )
    enrolls_dataframe = enrolls_dataframe.convert_objects(convert_numeric=True)
    enrolls_dataframe.drop('date', axis=1, inplace=True)
    enrolls_dataframe['type'] = card_type
    return enrolls_dataframe
# Fragment of a larger per-patient loop; the enclosing for-loops and the
# encounter_dates/systolic_bps/diastolic_bps/std_bp/count set-up are not shown.
value_quant = observation.find('{http://hl7.org/fhir}valueQuantity')
value = value_quant.find('{http://hl7.org/fhir}value')
diastolic_bps.append(value.get('value'))

if not systolic_bps and not diastolic_bps:
    # print "No Systolic/Diastolic BP"
    continue
if (len(encounter_dates) != len(systolic_bps)
        or len(systolic_bps) != len(diastolic_bps)):
    continue

enc_dict = {}
enc_dict['encounter_date'] = encounter_dates
enc_dict['systolic_bp'] = systolic_bps
enc_dict['diastolic_bp'] = diastolic_bps
encounters = DataFrame(enc_dict,
                       columns=['encounter_date', 'diastolic_bp', 'systolic_bp'])
encounters = encounters.convert_objects(convert_dates='coerce',
                                        convert_numeric=True)
# print encounters
# encounters.is_copy = False

# Keep encounters between 2004 and 2009 inclusive.
enc_period = encounters[(encounters.encounter_date.dt.year >= 2004) &
                        (encounters.encounter_date.dt.year <= 2009)]
if enc_period.empty:
    # print "No data in the given period"
    continue

# Assigning to a column of this slice triggers SettingWithCopyWarning;
# the original sidestepped it (see the commented-out is_copy line above).
enc_period['mean_bp'] = enc_period['diastolic_bp']  # + ((enc_period['systolic_bp'] - enc_period['diastolic_bp']) / 3)
mbp = enc_period['mean_bp'].mean()
sbp = enc_period['mean_bp'].std()
if math.isnan(sbp):
    continue
std_bp.append(sbp)
count = count + 1
def _data_to_frame(data, header, index_col, infer_types, skiprows):
    """Parse a BeautifulSoup table into a DataFrame.

    Parameters
    ----------
    data : tuple of lists
        The raw data to be placed into a DataFrame. This is a list of lists
        of strings or unicode. If it helps, it can be thought of as a matrix
        of strings instead.
    header : int or None
        An integer indicating the row to use for the column header or None
        indicating no header will be used.
    index_col : int or None
        An integer indicating the column to use for the index or None
        indicating no column will be used.
    infer_types : bool
        Whether to convert numbers and dates.
    skiprows : collections.Container or int or slice
        Iterable used to skip rows.

    Returns
    -------
    df : DataFrame
        A DataFrame containing the data from `data`.

    Raises
    ------
    ValueError
        * If `skiprows` is not found in the rows of the parsed DataFrame.

    See Also
    --------
    read_html

    Notes
    -----
    The `data` parameter is guaranteed not to be a list of empty lists.
    """
    thead, tbody, tfoot = data
    columns = thead or None
    df = DataFrame(tbody, columns=columns)

    if skiprows is not None:
        it = _get_skiprows_iter(skiprows)
        try:
            df = df.drop(it)
        except ValueError:
            raise ValueError('Labels {0} not found when trying to skip'
                             ' rows'.format(it))

    # convert to numbers/dates where possible
    # must be sequential since dates trump numbers if both args are given
    if infer_types:
        df = df.convert_objects(convert_numeric=True)
        df = df.convert_objects(convert_dates='coerce')

    if header is not None:
        header_rows = df.iloc[header]
        if header_rows.ndim == 2:
            names = header_rows.index
            df.columns = MultiIndex.from_arrays(header_rows.values,
                                                names=names)
        else:
            df.columns = header_rows
        df = df.drop(df.index[header])

    if index_col is not None:
        cols = df.columns[index_col]
        try:
            cols = cols.tolist()
        except AttributeError:
            pass

        # drop by default
        df.set_index(cols, inplace=True)
        if df.index.nlevels == 1:
            if isnull(df.index.name) or not df.index.name:
                df.index.name = None
        else:
            names = [name or None for name in df.index.names]
            df.index = MultiIndex.from_tuples(df.index.values, names=names)

    return df
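# In modern pandas the two sequential convert_objects calls above (numbers
# first, then dates, since "dates trump numbers") map roughly onto to_numeric
# and to_datetime. A sketch under that assumption, not pandas' actual current
# read_html code:
import pandas as pd

def infer_frame_types(df):
    for col in df.columns:
        numeric = pd.to_numeric(df[col], errors='coerce')
        if numeric.notnull().any():
            # Treat the column as numeric if anything parsed.
            df[col] = numeric
        else:
            # Otherwise fall back to dates, coercing failures to NaT.
            df[col] = pd.to_datetime(df[col], errors='coerce')
    return df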
def Get_dataframe(Symbol):
    Symbolx = str(Symbol) + '_history'
    # MongoClient() with no arguments connects to the default localhost:27017
    Db_cursor = MongoClient()['stox'][Symbolx].find()
    x = DataFrame(list(Db_cursor))
    x = x.convert_objects(convert_numeric=True)
    return x
def clicksDataframe(clicks_data):
    clicks_dataframe = DataFrame(clicks_data,
                                 columns=['date', 'cardName', 'position',
                                          'totalClicks', 'uniqueClicks'])
    clicks_dataframe = clicks_dataframe.convert_objects(convert_numeric=True)
    clicks_dataframe.drop('date', axis=1, inplace=True)
    return clicks_dataframe
def get_company_fundamental_fnguide(code):
    def g(x):
        if type(x) == str:
            return datetime.datetime.strptime(x, '%Y-%m-%d')
        else:
            return x

    # url = "http://comp.fnguide.com/SVO2/ASP/SVD_main.asp?pGB=1&gicode=A%s&cID=&MenuYn=Y&ReportGB=&NewMenuID=11&stkGb=&strResearchYN=" % (code)
    url = ("http://asp01.fnguide.com/SVO2/ASP/SVD_Main.asp"
           "?pGB=1&gicode=A%s&NewMenuID=11&cID=50&MenuYn=N" % (code))
    respstr = get_webpage(url, encoding="utf8")
    # soup = BeautifulSoup(respstr)
    soup = BeautifulSoup(respstr, "lxml")

    # IFRS separate / annual ("IFRS 별도/연간")
    target_table = soup.find("div", class_="um_table", id="highlight_B_Y")
    result = []
    try:
        target_table.find_all('tr')
    except Exception as e:
        # target_table is None when the page has no annual highlight table
        return (DataFrame(), DataFrame())

    for tr in target_table.find_all('tr'):
        for th in tr.find_all('th'):
            # Strip the provisional/estimate markers (and their Korean
            # equivalents), then normalise 'YYYY/MM' to 'YYYY-MM'.
            value = "%s" % th.text.replace('(P) : Provisional', '') \
                                  .replace('(E) : Estimate', '') \
                                  .replace('잠정실적', '') \
                                  .replace('컨센서스, 추정치', '') \
                                  .replace('(E)', '') \
                                  .replace('(P)', '') \
                                  .replace('/', '-').strip()
            # Pad the month to its last day.
            if '-02' in value:
                value = value + '-28'
            elif ('-04' in value) or ('-06' in value) or ('-09' in value) or ('-11' in value):
                value = value + '-30'
            elif ('-01' in value) or ('-03' in value) or ('-05' in value) or \
                 ('-07' in value) or ('-08' in value) or ('-10' in value) or ('-12' in value):
                value = value + '-31'
            result.append(value)
        for td in tr.find_all('td'):
            value = td.text.strip().replace(',', '')
            try:
                value = float(value)
            except Exception as e:
                value = 0
            result.append(value)

    # Drop the leading header cell, then reshape into rows of 9 periods.
    result = result[1:]
    dfdata = []
    for x in range(0, len(result), 9):
        dfdata.append(result[x:x + 9])
    df = DataFrame(data=dfdata, columns=[str(x) for x in range(1, 10)]).T
    # Korean column names: date, revenue, operating profit, net income,
    # total assets, total liabilities, total equity, capital stock,
    # debt ratio, retention ratio, operating margin, net margin, ROA, ROE,
    # EPS, BPS, DPS, PER, PBR, shares outstanding, dividend yield.
    df.columns = [
        '날짜', '매출액', '영업이익', '당기순이익', '자산총계', '부채총계', '자본총계',
        '자본금', '부채비율', '유보율', '영업이익률', '순이익률', 'ROA', 'ROE',
        'EPS', 'BPS', 'DPS', 'PER', 'PBR', '발행주식수', '배당수익률'
    ]
    df.drop(df.index[[0]], inplace=True)
    # df['날짜'] = df['date'].apply(g)
    # df.drop(['date'], axis=1, inplace=True)
    df = df.convert_objects(convert_numeric=True)
    # df.set_index('날짜', inplace=True)
    df_year = df

    # IFRS separate / quarterly ("IFRS 별도/분기"); same parsing as above.
    target_table = soup.find("div", class_="um_table", id="highlight_B_Q")
    result = []
    for tr in target_table.find_all('tr'):
        for th in tr.find_all('th'):
            value = "%s" % th.text.replace('(P) : Provisional', '') \
                                  .replace('(E) : Estimate', '') \
                                  .replace('잠정실적', '') \
                                  .replace('컨센서스, 추정치', '') \
                                  .replace('(E)', '') \
                                  .replace('(P)', '') \
                                  .replace('/', '-').strip()
            if '-02' in value:
                value = value + '-28'
            elif ('-04' in value) or ('-06' in value) or ('-09' in value) or ('-11' in value):
                value = value + '-30'
            elif ('-01' in value) or ('-03' in value) or ('-05' in value) or \
                 ('-07' in value) or ('-08' in value) or ('-10' in value) or ('-12' in value):
                value = value + '-31'
            result.append(value)
        for td in tr.find_all('td'):
            value = td.text.strip().replace(',', '')
            try:
                value = float(value)
            except Exception as e:
                value = 0
            result.append(value)

    result = result[1:]
    dfdata = []
    for x in range(0, len(result), 9):
        dfdata.append(result[x:x + 9])
    df = DataFrame(data=dfdata, columns=[str(x) for x in range(1, 10)]).T
    df.columns = [
        '날짜', '매출액', '영업이익', '당기순이익', '자산총계', '부채총계', '자본총계',
        '자본금', '부채비율', '유보율', '영업이익률', '순이익률', 'ROA', 'ROE',
        'EPS', 'BPS', 'DPS', 'PER', 'PBR', '발행주식수', '배당수익률'
    ]
    df.drop(df.index[[0]], inplace=True)
    # df['날짜'] = df['date'].apply(g)
    # df.drop(['date'], axis=1, inplace=True)
    df = df.convert_objects(convert_numeric=True)
    # df.set_index('날짜', inplace=True)
    df_qtr = df

    return (df_year, df_qtr)
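# The month-to-last-day padding above is written out twice as a long if/elif
# chain. A hypothetical helper (not in the original) that does the same thing,
# assuming the header text reduces to a 'YYYY-MM' string; unlike the literal
# '-28' branch, calendar.monthrange also gets leap-year February right:
import calendar

def pad_to_month_end(value):
    year, month = value.split('-')[:2]
    last_day = calendar.monthrange(int(year), int(month))[1]
    return '%s-%s-%02d' % (year, month, last_day)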
# for table_set in p.tables:
#     for table in table_set[1:]:
#         scoring_probability = float(table[7]) * float(table[-1])
#         if scoring_probability > 15.0:
#             a = '%s %f | %s %f' % (table[1], scoring_probability, table[1], scoring_probability)
#             print(a)

with open('table_example.html', encoding='utf-8') as file_:
    s = file_.read()

p = HTMLTableParser()
p.feed(s)

table_set = array([table[1:] for table_group in p.tables for table in table_group])
# The first row holds the column names; without this line df is undefined below.
df = DataFrame(table_set[1:], columns=table_set[0])
df = df.convert_objects(convert_numeric=True)
df['ZS'] = 9

# Czech column names: 'Tým' = team, 'Jméno' = name, 'Z' presumably games (zápasy).
for team, indexes in df.groupby('Tým').groups.items():
    team_data_set = df.loc[indexes, ['Z', 'ZS']]
    print((team_data_set['ZS'] / team_data_set['Z']).sum())

# for team in df.get('Tým').unique():
#     print(team)
# multiplication_result = df.get('S/Z') * df.get('RÚS')
# result_df = DataFrame({'Name': df.get('Jméno'), 'Team': df.get('Tým'),
#                        'Probability': multiplication_result})
# for team, indexes in result_df.groupby('Team').groups.items():
#     print(result_df.loc[indexes, ['Name', 'Probability']].set_index('Name').to_dict()['Probability'])
# for table_set in p.tables:
#     for table in table_set[1:]:
#         scoring_probability = float(table[7]) * float(table[-1])
#         if scoring_probability > 50.0:
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
# sklearn.cross_validation was renamed; in scikit-learn >= 0.18 this function
# lives in sklearn.model_selection.
from sklearn.cross_validation import train_test_split

# Import data from csv, group into columns for ease of use
import csv
import pandas as pd
from pandas import DataFrame as df

# f = open('auto-mpg.data.csv', 'rb')
df = pd.read_csv('auto-mpg.data.csv')
# print df.head
df = df.convert_objects(convert_numeric=True)
# df.info()

y = df.mpg.astype(float)
x = df.horsepower.astype(float)
# nan_to_num returns a new array rather than working in place, so its result
# must be assigned back; the original discarded it.
x = np.nan_to_num(x.values).reshape(-1, 1)
y = np.nan_to_num(y.values)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
# print(y_train.shape)
# print(x_train.shape)

# Train the Linear Regression model and plot a prediction
lr = LinearRegression()
lr.fit(x_train, y_train)
df['resp'] = 1  # adds a new column

# add the next trial
thisTrial = trials.next()
df = df.append(thisTrial, ignore_index=True)  # ignore_index because the trial carries no index
df['resp'][1] = 0
print(df)

# add the next trial
thisTrial = trials.next()
df = df.append(thisTrial, ignore_index=True)
df['resp'][2] = 1
print(df)

# Use pandas to calculate the proportion correct at each level. The dtypes
# here come out as "object", which has no mean, so convert them to numeric first.
df = df.convert_objects(convert_numeric=True)
# print('df='); print(df)  # debug

grouped = df.groupby('tilt')
groupMeans = grouped.mean()  # a DataFrame indexed by the tilt values
tiltsTested = list(groupMeans.index)
pResp = list(groupMeans['resp'])
ns = grouped.sum()  # want n per tilt to scale the data point sizes
ns = list(ns['resp'])
print('df mean at each tilt\n')
print(DataFrame({'tilt': tiltsTested, 'pResp': pResp, 'n': ns}))

# data point sizes: one entry in the array for each datapoint
# def plotDataAndPsychometricCurve(staircase, fit, descendingPsycho, threshVal):
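# The chained assignments above (df['resp'][1] = 0) can silently write to a
# copy and draw SettingWithCopyWarning in later pandas. A small self-contained
# sketch (hypothetical data, not from the original) of the .loc equivalent:
import pandas as pd

demo = pd.DataFrame({'tilt': [0.4, 0.4, 0.8], 'resp': [1, 1, 1]})
demo.loc[1, 'resp'] = 0  # single label-based assignment, no chained indexing
print(demo.groupby('tilt')['resp'].mean())  # proportion correct per tilt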
all_players.append(player_info)

# Column headers for player dataframe
player_col_header = ['player_id', 'last_name', 'first_name', 'country']

# Create a dataframe for keeping track of player info;
# every player starts with an elo rating of 1500
players = DataFrame(all_players, columns=player_col_header)
players['current_elo'] = Series(1500, index=players.index)
players['last_tourney_date'] = Series('N/A', index=players.index)
players['matches_played'] = Series(0, index=players.index)
players['peak_elo'] = Series(1500, index=players.index)
players['peak_elo_date'] = Series('N/A', index=players.index)

# Convert objects within dataframe to numeric
players = players.convert_objects(convert_numeric=True)

# Create an empty dataframe to store time-series elo for the top 10 players
# based on peak elo rating, using player_id as the column header.
# The top ten are: Djokovic, Federer, McEnroe, Nadal, Borg, Lendl, Becker,
# Murray, Sampras, Connors.
elo_timeseries_col_header = [104925, 103819, 100581, 104745, 100437,
                             100656, 101414, 104918, 101948, 100284]
elo_timeseries = DataFrame(columns=elo_timeseries_col_header)

# Read through the matches file for each year, starting from current_year,
# to update the players dataframe
current_year = 1968
for i in range((2015 - 1968) + 1):
    current_year_file_name = 'atp_matches_' + str(current_year) + '.csv'
    # read match CSV file and store important columns into lists
def totalHomePageViewsValue(data):
    homepage_view_dataframe = DataFrame(data, columns=['date', 'views', 'uniqueViews'])
    homepage_view_dataframe = homepage_view_dataframe.convert_objects(convert_numeric=True)
    return int(homepage_view_dataframe['uniqueViews'].sum())