Example #1
def atr(df,periods=14,high='high',low='low',close='close',include=True,str='{name}({period})',**kwargs):
	def _atr(df,periods,high,low,close,include,str,detail=False):
		study='ATR'
		_df=pd.DataFrame()
		## === talib ==== 
		# _df['ATR']=pd.Series(talib.ATR(df[high].values,
		# 							   df[low].values,
		# 							   df[close].values,
		# 							   periods),index=df.index)
		## === /talib ==== 

		## === pure python ==== 
		_df['HmL']=df[high]-df[low]
		_df['HmC']=abs(df[high]-df[close].shift(1))
		_df['LmC']=abs(df[low]-df[close].shift(1))
		_df['TR']=_df.apply(max,axis=1)
		_df['ATR']=_df['TR'].rolling(periods).mean()
		## === /pure python ==== 
		return rename(df,_df,study,periods,'',include,str,detail)
	periods=make_list(periods)
	__df=pd.concat([_atr(df,periods=y,high=high,low=low,close=close,include=False,str=str) for y in periods],axis=1)
	if include:
		return pd.concat([df,__df],axis=1)
	else:
		return __df
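
A self-contained sketch of the same true-range/ATR arithmetic on toy OHLC data (the column names and the 3-bar window are illustrative assumptions; the `rename`/`make_list` helpers used above are not needed here):

import pandas as pd

df = pd.DataFrame({
    'high':  [10.5, 10.8, 10.6, 11.0, 11.2],
    'low':   [10.0, 10.2, 10.1, 10.5, 10.7],
    'close': [10.3, 10.6, 10.4, 10.9, 11.0],
})

tr = pd.concat([
    df['high'] - df['low'],                      # high-low range
    (df['high'] - df['close'].shift(1)).abs(),   # gap vs. previous close (up)
    (df['low'] - df['close'].shift(1)).abs(),    # gap vs. previous close (down)
], axis=1).max(axis=1)                           # true range per bar

print(tr.rolling(3).mean())                      # 3-period ATR
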
Example #2
def pickle_trialDataSource():
    '''
    trialSourceDict is converted to a dataframe (trialEventsDF), merged into gazeEventsDF, and then pickled.
    return:  void
    '''

    global gazeEventsDF

    trialEventsDF = pd.DataFrame()

    for key, source in trialSourceDict.items():

        if key != "index":
            eventDF = source.to_df()
            eventDF['eventType'] = key
            trialEventsDF = pd.concat([eventDF, trialEventsDF], axis=0)

    if gazeEventsDF is False:
        pd.to_pickle(trialEventsDF, eventPickleLoc)
    else:
        # Remove old records from current trial from gazeEventsDF
        gazeEventsDF = gazeEventsDF[gazeEventsDF['trialNum'] != trialNum]
        # Add new data
        gazeEventsDF = pd.concat([gazeEventsDF, trialEventsDF], axis=0)
        pd.to_pickle(gazeEventsDF, eventPickleLoc)
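
A small sketch of the update pattern used above, dropping the rows that belong to the current trial and appending the freshly built ones; the frames, columns and `trialNum` value are toy assumptions:

import pandas as pd

gazeEventsDF = pd.DataFrame({'trialNum': [1, 1, 2], 'x': [0.1, 0.2, 0.3]})
trialEventsDF = pd.DataFrame({'trialNum': [2, 2], 'x': [0.4, 0.5]})
trialNum = 2

# Remove old records for the current trial, then append the new ones
gazeEventsDF = gazeEventsDF[gazeEventsDF['trialNum'] != trialNum]
gazeEventsDF = pd.concat([gazeEventsDF, trialEventsDF], axis=0)
print(gazeEventsDF)
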
Example #3
    def get_payment_method_details(self, *args):

        """
        A banner usually gives several payment options for users.
        This function returns a dataframe showing how many people clicked on each payment method, 
        how many successful donations came from each payment method,
        the percent of donations that came from each method,
        the total raised for each method,
        the average raised for each method, with outliers removed
        """

        # set up list of banner to process
        if len(args) == 0:
            names = self.names
        else:
            names = args


        ds = []

        #Define the metrics in the order requested by Megan
        column_order = [
        'name',
        'donations',
        'clicks',
        'conversion_rate',
        'percent clicked on',
        'percent donated on',
        'total_amount',
        'ave_amount_ro'
        ]
        # Step through metrics and compute them for each banner

        for name in names:

            clicks = self.data[name]['clicks']['payment_method'].value_counts()
            donations = self.data[name]['donations']['payment_method'].value_counts()
            donations_sum = self.data[name]['donations'].groupby(['payment_method']).apply(lambda x: x.amount.sum())
            ave = self.data[name]['clean_donations'].groupby(['payment_method']).apply(lambda x: x.amount.mean())
            df = pd.concat([donations, clicks, ave, donations_sum], axis=1)
            df.columns = ['donations', 'clicks', 'ave_amount_ro', 'total_amount']

            # metrics computed from above metrics
            df['conversion_rate'] = 100* df['donations'] / df['clicks']
            df['percent clicked on'] = 100*df['clicks'] / df['clicks'].sum()
            df['percent donated on'] = 100*df['donations'] / df['donations'].sum()
            df['name'] = name

            #Put the metrics in the order requested by Megan

            df = df[column_order]
            ds.append(df)


        df = pd.concat(ds)
        df.index = pd.MultiIndex.from_tuples(zip(df['name'], df.index))
        del df['name']
        df = df.sort_index()

        return df
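
A toy version of the final reshaping step above: stack per-banner frames and promote the 'name' column into the outer index level (banner names and numbers are made up):

import pandas as pd

a = pd.DataFrame({'donations': [3, 5]}, index=['cc', 'paypal'])
a['name'] = 'banner_A'
b = pd.DataFrame({'donations': [2, 7]}, index=['cc', 'paypal'])
b['name'] = 'banner_B'

df = pd.concat([a, b])
df.index = pd.MultiIndex.from_tuples(list(zip(df['name'], df.index)))
del df['name']
print(df.sort_index())
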
Example #4
def cci(df,periods=14,high='high',low='low',close='close',include=True,str='{name}({period})',**kwargs):
	def _cci(df,periods,high,low,close,include,str,detail=False):
		study='CCI'
		_df=pd.DataFrame()
		## === talib ==== 
		# _df['CCI']=pd.Series(talib.CCI(df[high].values,
		# 							   df[low].values,
		# 							   df[close].values,
		# 							   periods),index=df.index)
		## === /talib ==== 

		## === pure python ==== 
		_df['tp']=df[[low,high,close]].mean(axis=1)
		_df['avgTp']=_df['tp'].rolling(window=periods).mean()
		mad = lambda x: np.fabs(x - x.mean()).mean()
		_df['mad']=_df['tp'].rolling(window=periods).apply(mad)
		_df['CCI']=(_df['tp']-_df['avgTp'])/(0.015*_df['mad'])
		## === /pure python ==== 

		return rename(df,_df,study,periods,'',include,str,detail)
	periods=make_list(periods)
	__df=pd.concat([_cci(df,periods=y,high=high,low=low,close=close,include=False,str=str) for y in periods],axis=1)
	if include:
		return pd.concat([df,__df],axis=1)
	else:
		return __df
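
A self-contained sketch of the pure-python CCI arithmetic above on toy bars (the 3-bar window is an assumption):

import pandas as pd
import numpy as np

df = pd.DataFrame({
    'high':  [10.5, 10.8, 10.6, 11.0, 11.2, 11.1],
    'low':   [10.0, 10.2, 10.1, 10.5, 10.7, 10.6],
    'close': [10.3, 10.6, 10.4, 10.9, 11.0, 10.8],
})

tp = df[['low', 'high', 'close']].mean(axis=1)               # typical price
avg_tp = tp.rolling(window=3).mean()
mad = tp.rolling(window=3).apply(lambda x: np.fabs(x - x.mean()).mean())
print((tp - avg_tp) / (0.015 * mad))                         # CCI
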
Example #5
def getData(folderList, shapes, trips, stopTimes, calendar, frequencies):
    for folder in folderList:
        print('Adding data from ' + folder + '.')

        # Read the files from the data.
        readShapes = pd.read_csv('../' + folder + '/shapes.txt')[shapeData]
        readTrips = pd.read_csv('../' + folder + '/trips.txt')[routeData]
        readStopTimes = pd.read_csv('../' + folder + '/stop_times.txt')[timeData]
        readCalendar = pd.read_csv('../' + folder + '/calendar.txt')[calendarData]

        # Append it to the existing data.
        shapes = pd.concat([shapes, readShapes])
        trips = pd.concat([trips, readTrips])
        stopTimes = pd.concat([stopTimes, readStopTimes])
        calendar = pd.concat([calendar, readCalendar])

        if os.path.isfile('../' + folder + '/frequencies.txt'):
            readFrequencies = pd.read_csv('../' + folder + '/frequencies.txt')
            frequencies = pd.concat([frequencies, readFrequencies])

        # Calculate the number of missing shapes.
        num_shapes = trips.groupby('route_id').size()
        num_validshapes = trips[trips.shape_id.isin(shapes.shape_id)].groupby('route_id').size()
        num_missingshapes = num_shapes - num_validshapes
        percent_missingshapes = num_missingshapes / num_shapes * 100
        print('Missing data from ' + folder + ':')
        num_missingshapesList = num_missingshapes[num_missingshapes != 0]
        if not num_missingshapesList.empty:
            print(num_missingshapesList)
            print(percent_missingshapes[percent_missingshapes != 0])
        else:
            print('No data missing.\n')

    return lists(shapes, trips, stopTimes, calendar, frequencies)
Example #6
def getDummiesInplace(columnList, train, test = None):
    #Takes in a list of column names and one or two pandas dataframes
    #One-hot encodes all indicated columns inplace
    columns = []
    
    if test is not None:
        df = pd.concat([train,test], axis= 0)
    else:
        df = train
        
    for columnName in df.columns:
        index = df.columns.get_loc(columnName)
        if columnName in columnList:
            dummies = pd.get_dummies(df.iloc[:, index], prefix = columnName, prefix_sep = ".")
            columns.append(dummies)
        else:
            columns.append(df.iloc[:, index])
    df = pd.concat(columns, axis = 1)
    
    if test is not None:
        train = df[:train.shape[0]]
        test = df[train.shape[0]:]
        return train, test
    else:
        train = df
        return train
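
A self-contained sketch of the same idea, encoding train and test together so both end up with identical dummy columns (the column names are illustrative):

import pandas as pd

train = pd.DataFrame({'color': ['red', 'blue'], 'size': [1, 2]})
test = pd.DataFrame({'color': ['blue', 'green'], 'size': [3, 4]})

# Encode on the combined frame, then split back on the original row count
combined = pd.concat([train, test], axis=0, ignore_index=True)
encoded = pd.concat(
    [pd.get_dummies(combined['color'], prefix='color', prefix_sep='.'),
     combined[['size']]],
    axis=1)

train_enc = encoded.iloc[:len(train)]
test_enc = encoded.iloc[len(train):]
print(sorted(train_enc.columns) == sorted(test_enc.columns))  # True
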
Example #7
def parse_sub(sub, office, district):
    sub = sub.reset_index(drop=True)

    # Special case these. Needs to be cleaned up and generalized.
    if (office, district) == ('U.S. House', '33'):
        sub = pd.concat([sub.iloc[0:4,   0:-1].reset_index(drop=True),
                         sub.iloc[5:9,   1:-1].reset_index(drop=True),
                         sub.iloc[10:14, 1:].reset_index(drop=True)], axis=1).dropna(how='all')
    elif (office, district) == ('State Assembly', '33'):
        sub = pd.concat([sub.iloc[0:4, 0:-1].reset_index(drop=True),
                         sub.iloc[5:9, 1:].reset_index(drop=True)], axis=1).dropna(how='all')
    elif (office, district) == ('U.S. House', '24'):
        sub = pd.concat([sub.iloc[0:6,  0:-1].reset_index(drop=True),
                         sub.iloc[7:13, 1:].reset_index(drop=True)], axis=1).dropna(how='all')

    sub.columns = ['county'] + \
        sub.iloc[:, 1:-1].iloc[0].fillna('').tolist() + ['office']
    sub = sub.dropna(axis=1, how='all')
    sub = sub.rename(columns=parse_candidate)
    parties = sub.iloc[:, 1:-1].iloc[1].to_dict()
    sub = sub[sub.county.isin(COUNTIES)]
    sub = pd.melt(sub, id_vars=['county', 'office'], value_vars=sub.columns.tolist()[
        1:-1], var_name='candidate', value_name='votes')
    sub['party'] = sub.candidate.apply(lambda x: parties[x])
    sub = sub.assign(office=office, district=district)
    return sub[fieldnames]
Example #8
    def test_iloc_non_unique_indexing(self):

        # GH 4017, non-unique indexing (on the axis)
        df = DataFrame({'A': [0.1] * 3000, 'B': [1] * 3000})
        idx = np.array(lrange(30)) * 99
        expected = df.iloc[idx]

        df3 = pd.concat([df, 2 * df, 3 * df])
        result = df3.iloc[idx]

        tm.assert_frame_equal(result, expected)

        df2 = DataFrame({'A': [0.1] * 1000, 'B': [1] * 1000})
        df2 = pd.concat([df2, 2 * df2, 3 * df2])

        sidx = df2.index.to_series()
        expected = df2.iloc[idx[idx <= sidx.max()]]

        new_list = []
        for r, s in expected.iterrows():
            new_list.append(s)
            new_list.append(s * 2)
            new_list.append(s * 3)

        expected = DataFrame(new_list)
        expected = pd.concat([expected, DataFrame(index=idx[idx > sidx.max()])
                              ])
        result = df2.loc[idx]
        tm.assert_frame_equal(result, expected, check_index_type=False)
Example #9
def iterate_weather_files():
    new_dates = pd.date_range(str(constants.start_yr)+'-01-01',str(constants.until_yr)+'-12-31',freq='D')

    # Iterate through all daily weather files
    for fl in glob.iglob(os.path.join(constants.wth_dir, '*.txt')):
        print(os.path.basename(fl))
        inp_df,ix_df = read_input_data(fl)

        # Create output climatology file
        frames = [compute_climatology(col_num,inp_df,ix_df,new_dates) for col_num in range(3,len(inp_df.columns))]
        result = pd.concat(frames,axis=1)

        # Add year, month and day columns (1st 3 columns)
        result.columns = range(3,len(inp_df.columns))
        result[0] = result.index.year
        result[1] = result.index.month
        result[2] = result.index.day

        comb_df   = pd.concat([inp_df,result])

        # Output to new weather file
        epic_out  = open(constants.out_dir+os.sep+os.path.basename(fl),'w')
        for index, row in comb_df.iterrows():
            epic_out.write(('%6d%4d%4d'+6*'%6.2f'+'\n') %
                        (int(row[0]),int(row[1]),int(row[2]),
                         float(row[3]),float(row[4]),float(row[5]),
                         float(row[6]),float(row[7]),float(row[8])))
        epic_out.close()
Example #10
    def test_categorical_writing(self):
        original = DataFrame.from_records(
            [
                ["one", "ten", "one", "one", "one", 1],
                ["two", "nine", "two", "two", "two", 2],
                ["three", "eight", "three", "three", "three", 3],
                ["four", "seven", 4, "four", "four", 4],
                ["five", "six", 5, np.nan, "five", 5],
                ["six", "five", 6, np.nan, "six", 6],
                ["seven", "four", 7, np.nan, "seven", 7],
                ["eight", "three", 8, np.nan, "eight", 8],
                ["nine", "two", 9, np.nan, "nine", 9],
                ["ten", "one", "ten", np.nan, "ten", 10]
            ],
            columns=['fully_labeled', 'fully_labeled2', 'incompletely_labeled',
                     'labeled_with_missings', 'float_labelled', 'unlabeled'])
        expected = original.copy()

        # these are all categoricals
        original = pd.concat([original[col].astype('category') for col in original], axis=1)

        expected['incompletely_labeled'] = expected['incompletely_labeled'].apply(str)
        expected['unlabeled'] = expected['unlabeled'].apply(str)
        expected = pd.concat([expected[col].astype('category') for col in expected], axis=1)
        expected.index.name = 'index'

        with tm.ensure_clean() as path:
            with warnings.catch_warnings(record=True) as w:
                # Silence warnings
                original.to_stata(path)
                written_and_read_again = self.read_dta(path)
                tm.assert_frame_equal(written_and_read_again.set_index('index'), expected)
Example #11
    def test_categorical_warnings_and_errors(self):
        # Warning for non-string labels
        # Error for labels too long
        original = pd.DataFrame.from_records(
            [['a' * 10000],
             ['b' * 10000],
             ['c' * 10000],
             ['d' * 10000]],
            columns=['Too_long'])

        original = pd.concat([original[col].astype('category') for col in original], axis=1)
        with tm.ensure_clean() as path:
            tm.assertRaises(ValueError, original.to_stata, path)

        original = pd.DataFrame.from_records(
            [['a'],
             ['b'],
             ['c'],
             ['d'],
             [1]],
            columns=['Too_long'])
        original = pd.concat([original[col].astype('category') for col in original], axis=1)

        with warnings.catch_warnings(record=True) as w:
            original.to_stata(path)
            tm.assert_equal(len(w), 1)  # should get a warning for mixed content
Example #12
def preprocess_greyc_nislab(in_file, out_file):
    """
    Preprocess the raw GREYC NISLAB dataset
    """
    df = pd.concat([pd.read_excel(in_file, sheetname=0),
                    pd.read_excel(in_file, sheetname=1),
                    pd.read_excel(in_file, sheetname=2),
                    pd.read_excel(in_file, sheetname=3),
                    pd.read_excel(in_file, sheetname=4)])

    df = df[df['Class'] == 2]

    df['age'] = (df['Age'] < 30).map({True: '<30', False: '>=30'})
    df['gender'] = df['Gender'].map({'F': 'female', 'M': 'male'})
    df['handedness'] = df['Handedness'].map({'L': 'left', 'R': 'right'})
    df['session'] = np.arange(len(df))

    df['password'] = df['Password'].map({
        'leonardo dicaprio': 1,
        'the rolling stones': 2,
        'michael schumacher': 3,
        'red hot chilli peppers': 4,
        'united states of america': 5,
    })

    def preprocess_row(idx_row):
        idx, row = idx_row
        keyname = list(map(lambda x: 'space' if x == ' ' else x, list(row['Password'])))
        v = np.array(row['Keystroke Template Vector'].strip().split()).astype(int) // 10000

        s = len(keyname) - 1
        pp, rr, pr, rp = [v[s * i:s * (i + 1)] for i in range(4)]

        timepress = np.r_[0, pp].cumsum()

        # Offset the first release time by the duration of the first key
        timerelease = np.r_[rp[0] - rr[0], rr].cumsum()

        # There are ~180 rows where timerelease == timepress.
        # Fix these by assuming at least the minimum standard clock resolution
        timerelease[timerelease == timepress] += 16
        sample = pd.DataFrame.from_items([
            ('user', row['User_ID']),
            ('session', row['session']),
            ('password', row['password']),
            ('age', row['age']),
            ('gender', row['gender']),
            ('handedness', row['handedness']),
            ('timepress', timepress),
            ('timerelease', timerelease),
            ('keyname', keyname)
        ])

        return sample

    df = pd.concat(map(preprocess_row, df.iterrows()))
    df = df.set_index(['user', 'session'])[COLS]
    df = remove_repeated_keys(df)
    df.to_csv(out_file)
    return
Example #13
def order_hist(CreateGroupList,num,f):
    order = pd.read_csv('./B/jdata_user_order.csv', parse_dates=['o_date'])
    sku = pd.read_csv('./B/jdata_sku_basic_info.csv', )
    order = pd.merge(order, sku, on='sku_id', how='left')
    target_order = order[(order.cate == 101) | (order.cate == 30)].reset_index(drop=True)
    first_day = datetime.datetime.strptime('2016-08-31 00:00:00', '%Y-%m-%d %H:%M:%S')
    target_order['o_day_series'] = (target_order['o_date'] - first_day).apply(lambda x: x.days)

    target_order = target_order.sort_values(by=['user_id','o_day_series'], ascending=False).reset_index(drop=True)

    alld = []
    for CG in CreateGroupList:
        CreateGroup = CG
        t = target_order[target_order.o_day_series < CreateGroup]
        features =[]
        for i in range(num):
            t2 = t[['user_id',f]].groupby(['user_id']).shift(-i)
            t2.columns = t2.columns + '_{}'.format(i)
            features.append(t2.columns[0])
            t = pd.concat([t,t2],axis=1)
        x = t.drop_duplicates(subset=['user_id'])
        x = x[['user_id'] + features]
        x['CreateGroup'] = CreateGroup
        alld.append(x)
    df = pd.concat(alld).reset_index(drop=True)
#    print(np.unique(df.CreateGroup))
    return df
Example #14
    def test_bollinger(self):
        prices = self.load_pandas('test_bollinger.pkl')
        df = pandas.concat([prices, prices.shift()], axis=1)
        df.columns = ['price', 'price_prev']
        df['sigma'] = prices.std()
        df['mu'] = prices.mean()
        cumul = {'current_scaling': 0.}

        def scale(row, cumul=cumul):
            current_scaling = cumul['current_scaling']
            price = row['price']
            mu = row['mu']
            sigma = 0.8 * row['sigma']
            new_position_scaling = get_position_scaling(price, current_scaling, mu, sigma)
            # updating for next step
            cumul['current_scaling'] = new_position_scaling
            result = {
                'position_scaling': new_position_scaling,
                'band_inf': mu + ((new_position_scaling + 1) * sigma),
                'band_mid': mu + (new_position_scaling * sigma),
                'band_sup': mu + ((new_position_scaling - 1) * sigma)
            }
            return pandas.Series(result)

        df = pandas.concat([df, df.apply(scale, axis=1)], axis=1)
        df_diff = df['position_scaling'] - df['position_scaling'].shift().fillna(0.)
        expected = {Timestamp('2013-02-20 00:00:00'): -2.0, Timestamp('2012-12-06 00:00:00'): 0.0,
                    Timestamp('2012-05-29 00:00:00'): 0.0, Timestamp('2012-01-01 00:00:00'): -3.0,
                    Timestamp('2012-07-07 00:00:00'): 1.0, Timestamp('2012-02-10 00:00:00'): -2.0,
                    Timestamp('2013-01-05 00:00:00'): -1.0, Timestamp('2012-03-03 00:00:00'): -1.0,
                    Timestamp('2013-01-27 00:00:00'): -1.0, Timestamp('2012-04-04 00:00:00'): 0.0,
                    Timestamp('2012-04-18 00:00:00'): 1.0, Timestamp('2013-01-12 00:00:00'): 0.0}
        variations = df_diff[df_diff != 0.].cumsum().to_dict()
        self.assertEqual(expected, variations)
Example #15
def getAllJunctionSeqs():
    # save junctions : first B1 junctions
    junctionMotifs = []
    junctionSeqs = {}
    flanks = [['G', 'C', 'G', 'C']]

    maxNumSeqs = 12
    for motif in [ '_', 'B1', 'B1,B1', 'B1,B1,B1', 'M', 'M,M', 'M,M,M', 'M,B1', 'M,M,B1', 'M,B1,B1']:
        junctionSeqs[motif] = {}
        
        for flank in flanks:

            baseNum = len(flank)//2
            junctionMotif = ','.join(flank[:baseNum] + [motif] + flank[baseNum:])
    
            junctionSeq = Junction(tuple(junctionMotif.split(','))).sequences
            junctionSeq.loc[:, 'n_flank'] = baseNum
            numSeqs = len(junctionSeq)
            
            # reduce total number of sequences 
            if numSeqs > maxNumSeqs:
                index = np.linspace(0, numSeqs - 1, maxNumSeqs).astype(int)
                
                junctionSeq = junctionSeq.loc[index]
                
            junctionSeqs[motif][''.join(flank)] = junctionSeq
        junctionSeqs[motif] = pd.concat(junctionSeqs[motif], names=['flank', 'junction_num'])
    
    return pd.concat(junctionSeqs, names=['junction'])
Example #16
def station_files_to_df(station_path, preamble='d04_text_station', concat_intv=10):
    """
    Reads all the individual station files in directory at station_path and returns them as a single dataframe.
    :param station_path: (str) Path to directory with station data files.
    :param concat_intv: (int) Number of files to read and convert to data frames before concatenating them into the
    accumulator. It would be fastest to open everything and concat only once, but this could cause memory problems.
    :param preamble: (str) Text that target file names begin with.
    :return: (pd.DataFrame) Dataframe containing all the data from the individual files with a date column appended.
    WARNING: This method only keeps the totals for each station. The lane-level data are thrown away.
    """
    head = ['Timestamp', 'Station', 'District', 'Fwy', 'Dir', 'Type',
            'Length', 'Samples', 'Observed', 'Total_Flow', 'Avg_Occ', 'Avg_Speed'] # Header for output df
    df = pd.DataFrame(columns=head)
    start_dir = os.getcwd()
    os.chdir(station_path)
    fnames = [n for n in os.listdir('.') if n[0:len(preamble)] == preamble]  # List of all file name to read
    temp_list = [df]
    for name in fnames:
        print('Adding file: ' + name)
        temp = pd.read_csv(name, sep=',', header=None).iloc[:, 0:len(head)]
        temp.columns = head
        temp_list.append(temp)
        #TODO cast the Station column to int
        if len(temp_list) == concat_intv:
            temp_list = [pd.concat(temp_list)]
    os.chdir(start_dir)
    return pd.concat(temp_list)
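
A toy sketch of the chunked concatenation trick above: collapse the accumulator every `concat_intv` frames instead of concatenating once per file, to bound memory (frames and interval are made up):

import pandas as pd

frames, concat_intv = [], 3
for i in range(10):
    frames.append(pd.DataFrame({'x': [i]}))
    if len(frames) == concat_intv:
        frames = [pd.concat(frames)]          # collapse the list back to one frame
print(len(pd.concat(frames, ignore_index=True)))  # 10
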
Example #17
def get_pnl_stats(df, start_capital, marginrate, freq):
	df['pnl'] = df['pos'].shift(1)*(df['close'] - df['close'].shift(1)).fillna(0.0)
	df['margin'] = pd.concat([df.pos*marginrate[0]*df.close, -df.pos*marginrate[1]*df.close], join='outer', axis=1).max(1)
	if freq == 'm':
		daily_pnl = df['pnl'].resample('1D').sum().dropna()
		daily_margin = df['margin'].resample('1D').last().dropna()
		daily_cost = df['cost'].resample('1D').sum().dropna()
	else:
		daily_pnl = pd.Series(df['pnl'])
		daily_margin = pd.Series(df['margin'])
		daily_cost = pd.Series(df['cost'])
	daily_pnl.name = 'daily_pnl'
	daily_margin.name = 'daily_margin'
	daily_cost.name = 'daily_cost'
	cum_pnl = pd.Series(daily_pnl.cumsum() + daily_cost.cumsum() + start_capital, name = 'cum_pnl')
	available = cum_pnl - daily_margin
	res = {}
	res['avg_pnl'] = daily_pnl.mean()
	res['std_pnl'] = daily_pnl.std()
	res['tot_pnl'] = daily_pnl.sum()
	res['tot_cost'] = daily_cost.sum()
	res['num_days'] = len(daily_pnl)
	res['sharp_ratio'] = res['avg_pnl']/res['std_pnl']*np.sqrt(252.0)
	max_dd, max_dur = max_drawdown(cum_pnl)
	res['max_margin'] = daily_margin.max()
	res['min_avail'] = available.min() 
	res['max_drawdown'] =  max_dd
	res['max_dd_period'] =  max_dur
	if abs(max_dd) > 0:
		res['profit_dd_ratio'] = res['tot_pnl']/abs(max_dd)
	else:
		res['profit_dd_ratio'] = 0
	ts = pd.concat([cum_pnl, daily_margin, daily_cost], join='outer', axis=1)
	return res, ts
Example #18
	def get_features(self):
		if self._features is not None:
			return self._features
		
		feats = []
		
		for d in self._data:
			self._prep_data(d)
			c = self._feature_subset.data_subset(d)
			f = pd.concat(map(d._rm_break_info,c), axis=0, ignore_index=True)
			cols_set = set(list(f.columns))
			h = [i for i in d.row_index_header if i in cols_set]
		
			f = f.set_index(h)
			d.prep_once_flag = True
			feats.append(f)
					
		for i,j in izip(feats, feats[1:]):
			assert (i.index == j.index).all()
		features = pd.concat(feats, axis=1, ignore_index=True)
		features = features.fillna(0)
		features = features.astype(float)
	
		assert self._feature_subset.has_allele()
 
		self._features = features.T
		return self._features
Example #19
    def test_concat_sparse_dense_cols(self, fill_value, sparse_idx, dense_idx):
        # See GH16874, GH18914 and #18686 for why this should be a DataFrame
        from pandas.core.dtypes.common import is_sparse

        frames = [self.dense1, self.dense3]

        sparse_frame = [frames[dense_idx],
                        frames[sparse_idx].to_sparse(fill_value=fill_value)]
        dense_frame = [frames[dense_idx], frames[sparse_idx]]

        # This will try both directions sparse + dense and dense + sparse
        for _ in range(2):
            res = pd.concat(sparse_frame, axis=1)
            exp = pd.concat(dense_frame, axis=1)
            cols = [i for (i, x) in enumerate(res.dtypes) if is_sparse(x)]

            for col in cols:
                exp.iloc[:, col] = exp.iloc[:, col].astype("Sparse")

            for column in frames[dense_idx].columns:
                if dense_idx == sparse_idx:
                    tm.assert_frame_equal(res[column], exp[column])
                else:
                    tm.assert_series_equal(res[column], exp[column])

            tm.assert_frame_equal(res, exp)

            sparse_frame = sparse_frame[::-1]
            dense_frame = dense_frame[::-1]
Example #20
 def add_field(self, field):
     """Adds a field object to the universe."""
     self._traits_need_update = True
     if isinstance(field, AtomicField):
         if not hasattr(self, 'field'):
             self.field = field
         else:
             new_field_values = self.field.field_values + field.field_values
             newdx = range(len(self.field), len(self.field) + len(field))
             field.index = newdx
             new_field = pd.concat([self.field, field])
             self.field = AtomicField(new_field, field_values=new_field_values)
     elif isinstance(field, list):
         if not hasattr(self, 'field'):
             fields = pd.concat(field)
             fields.index = range(len(fields))
             fields_values = [j for i in field for j in i.field_values]
             self.field = AtomicField(fields, field_values=fields_values)
         else:
             new_field_values = self.field.field_values + [j for i in field for j in i.field_values]
             newdx = range(len(self.field), len(self.field) + sum([len(i.field_values) for i in field]))
             for i, idx in enumerate(newdx):
                 field[i].index = [idx]
             new_field = pd.concat([self.field] + field)
             self.field = AtomicField(new_field, field_values=new_field_values)
     else:
         raise TypeError('field must be an instance of exatomic.field.AtomicField or a list of them')
     self._traits_need_update = True
Example #21
def concat(*universes, name=None, description=None, meta=None):
    """
    Warning:
        This function is not fully featured or tested yet!
    """
    raise NotImplementedError()
    kwargs = {'name': name, 'description': description, 'meta': meta}
    names = []
    for universe in universes:
        for key, data in universe._data().items():
            name = key[1:] if key.startswith('_') else key
            names.append(name)
            if name in kwargs:
                kwargs[name].append(data)
            else:
                kwargs[name] = [data]
    for name in set(names):
        cls = kwargs[name][0].__class__
        if isinstance(kwargs[name][0], Field):
            data = pd.concat(kwargs[name])
            values = [v for field in kwargs[name] for v in field.field_values]
            kwargs[name] = cls(data, field_values=values)
        else:
            kwargs[name] = cls(pd.concat(kwargs[name]))
    return Universe(**kwargs)
Example #22
def generate_dataset(pathway):

	pathway_id, pathway_genes = pathway

	POSITIVE_SAMPLES = 100
	NEGATIVE_SAMPLES = 100

	ovarian = pd.read_csv('../data_preparation/ovarian_inbiomap_exp.tsv', index_col=0)

	means = ovarian.mean(axis=0)
	covariances = ovarian.cov()
	variances = ovarian.var()

	print('here')

	new_pathway_means = pd.Series(np.random.normal(0,variances), index=variances.index)[pathway_genes].fillna(0)
	new_means = pd.concat([means, new_pathway_means], axis=1).fillna(0).sum(axis=1).reindex(means.index)

	positives = pd.DataFrame(np.random.multivariate_normal(new_means, covariances, size=POSITIVE_SAMPLES))
	positives.index = [pathway_id+' positive']*len(positives)

	negatives = pd.DataFrame(np.random.multivariate_normal(means, covariances, size=NEGATIVE_SAMPLES))
	negatives.index = [pathway_id+' negative']*len(negatives)

	dataset = pd.concat([positives, negatives]).sample(frac=1)  # shuffle
	dataset.columns = ovarian.columns

	filename = 'synthetic_'+pathway_id+'_'+str(POSITIVE_SAMPLES)+'pos_'+str(NEGATIVE_SAMPLES)+'neg.csv'
	return dataset.to_csv(filename, index=True, header=True)
Example #23
def build_totals():
    h5_name = "../amounts.h5"
    store = HDFStore(h5_name)

    files = ['logement_tous_regime', 'pfam_tous_regimes',
             'minima_sociaux_tous_regimes', 'IRPP_PPE', 'cotisations_TousRegimes' ]

    first = True
    for xlsfile in files:
        xls = ExcelFile(xlsfile + '.xlsx')
        print(xls.path_or_buf)
        df_a = xls.parse('amounts', na_values=['NA'])
        try:
            df_b   = xls.parse('benef', na_values=['NA'])
        except:
            df_b = DataFrame()

        if first:
            amounts_df = df_a
            benef_df =  df_b
            first = False
        else:
            amounts_df = concat([amounts_df, df_a])
            benef_df =  concat([benef_df, df_b])

    amounts_df, benef_df = amounts_df.set_index("var"), benef_df.set_index("var")
    print(amounts_df.to_string())
    print(benef_df.to_string())
    store['amounts'] = amounts_df
    store['benef']   = benef_df
    store.close()
Example #24
 def bollinger_band(self, tick, window=20, k=2, nml=False, mi_only=False):
     """
     Return four arrays for Bollinger Band.
     The first one is the moving average.
     The second one is the upper band.
     The third one is the lower band.
     The fourth one is the Bollinger value.
     If mi_only, then return the moving average only.
     """
     ldt_timestamps = self.index
     dt_timeofday = dt.timedelta(hours=16)
     days_delta = dt.timedelta(days=(np.ceil(window*7/5)+5))
     dt_start = ldt_timestamps[0] - days_delta
     dt_end = ldt_timestamps[0] - dt.timedelta(days=1)
     pre_timestamps = du.getNYSEdays(dt_start, dt_end, dt_timeofday)
     # ldf_data has the data prior to our current interest.
     # This is used to calculate moving average for the first window.
     ldf_data = ut.get_tickdata([tick], pre_timestamps)
     if nml:
         ma_data = pd.concat([ldf_data[tick]['nml_close'], self['nml_close']]) 
     else:
         ma_data = pd.concat([ldf_data[tick]['close'], self['close']])
     bo = dict()
     bo['mi'] = ma_data.rolling(window=window).mean()[ldt_timestamps]
     if mi_only:
         return bo['mi']
     else:
         sigma = ma_data.rolling(window=window).std()
         bo['up'] = bo['mi'] + k * sigma[ldt_timestamps] 
         bo['lo'] = bo['mi'] - k * sigma[ldt_timestamps] 
         bo['ba'] = (ma_data[ldt_timestamps] - bo['mi']) / (k * sigma[ldt_timestamps])
         return bo
Example #25
def find_steady_states_transients(metergroup, columns, noise_level,
                                  state_threshold, **load_kwargs):
    """
    Returns
    -------
    steady_states, transients : pd.DataFrame
    """
    steady_states_list = []
    transients_list = []

    for power_df in metergroup.load(columns=columns, **load_kwargs):
        """
        if len(power_df.columns) <= 2:
            # Use whatever is available
            power_dataframe = power_df
        else:
            # Active, reactive and apparent are available
            power_dataframe = power_df[[('power', 'active'), ('power', 'reactive')]]
        """
        power_dataframe = power_df.dropna()
        if power_dataframe.empty: 
            continue

        x, y = find_steady_states(
            power_dataframe, noise_level=noise_level,
            state_threshold=state_threshold)
        steady_states_list.append(x)
        transients_list.append(y)
    return [pd.concat(steady_states_list), pd.concat(transients_list)]
Example #26
def boll(df,periods=20,boll_std=2,column=None,include=True,str='{name}({column},{period})',detail=False,**boll_kwargs):
	def _boll(df,periods,column):
		study='BOLL'
		df,_df,column=validate(df,column)

		## === talib ==== 
		# upper,middle,lower=talib.BBANDS(df[column].values,periods,boll_std,boll_std)
		# _df=pd.DataFrame({'SMA':middle,'UPPER':upper,'LOWER':lower},index=df.index)
		## === /talib ==== 

		## === pure python ==== 
		_df['SMA']=df[column].rolling(window=periods).mean()
		_df['UPPER']=_df['SMA']+df[column].rolling(window=periods).std()*boll_std
		_df['LOWER']=_df['SMA']-df[column].rolling(window=periods).std()*boll_std
		## === /pure python ==== 

		return rename(df,_df,study,periods,column,False,str,detail,output=output)
	column=make_list(column)
	periods=make_list(periods)
	output=['SMA','UPPER','LOWER']
	__df=pd.concat([_boll(df,column=x,periods=y) for y in periods for x in column],axis=1)
	if include:
		return pd.concat([df,__df],axis=1)
	else:
		return __df
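
A self-contained sketch of the pure-python Bollinger band arithmetic above on toy prices (the window and standard-deviation multiplier are assumptions):

import pandas as pd

close = pd.Series([10.3, 10.6, 10.4, 10.9, 11.0, 10.8, 11.2])
sma = close.rolling(window=3).mean()
band = close.rolling(window=3).std() * 2
bands = pd.concat([sma, sma + band, sma - band], axis=1)
bands.columns = ['SMA', 'UPPER', 'LOWER']
print(bands)
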
Example #27
 def __get_freq_vdj(self, data_type, prob = False):
     __sep = '__________'
     sample_freqs = []
     sample_names = []
     for sample in self.samples:
         freq = sample.get_summary(data_type, prob = prob)
         freqval = pd.Series(freq.frequency)
         freqval.index = freq.v.replace(np.nan, 'NA') + __sep + \
                         freq.d.replace(np.nan, 'NA') + __sep + \
                         freq.j.replace(np.nan, 'NA')
         sample_freqs.append(freqval)
         sample_names.append(sample.name)
     freq_dataframe = pd.concat(sample_freqs, axis = 1)
     freq_dataframe.columns = sample_names
     
     vdj_v = []
     vdj_d = []
     vdj_j = []
     for vdj_combination in freq_dataframe.index:
         vv, dd, jj = vdj_combination.split(__sep)
         vdj_v.append(vv)
         vdj_d.append(dd)
         vdj_j.append(jj)
     
     freq_vdjcmbn = pd.concat([pd.Series(vdj_v), pd.Series(vdj_d), pd.Series(vdj_j)], axis = 1).replace('NA', np.nan)
     freq_vdjcmbn.columns = ['V', 'D', 'J']
     
     freq_vdjcmbn.reset_index(drop = True, inplace = True)
     freq_dataframe.reset_index(drop = True, inplace = True)
     
     freq = pd.concat([freq_vdjcmbn, freq_dataframe], axis = 1)
     return freq
Example #28
def submit_partial_merge(base, folder, all_blended=False):
  root_path = '/home/workspace/checkins'
  folder = "%s/submit/%s" % (root_path, folder)
  stamp = str(datetime.now().strftime("%Y%m%d_%H%M%S"))
  output = "%s/submit/treva_overwrite_%s_all_blended_%s.csv" % (root_path, stamp, all_blended)

  if all_blended:
    tfiles = [f for f in listdir(folder) if 'blend' in f]
  else:
    tfiles = [f for f in listdir(folder) if 'blend' not in f]

  # # remove old batch
  # print("tfiles before removing old batch: %i" % len(tfiles))
  # old_partials = [f for f in listdir(root_path + "/submit/treva_merge")]
  # tfiles = [f for f in tfiles if f not in old_partials]
  # print("tfiles after removing old batch: %i" % len(tfiles))

  # concat and merge
  df_treva = [pd.read_csv("%s/%s" % (folder, f)) for f in tfiles]
  df_treva = pd.concat(df_treva).sort_values(by='row_id')
  df_base = pd.read_csv("%s/data/submits/%s" % (root_path, base))

  df_base = df_base[~df_base.row_id.isin(df_treva.row_id.values)]
  df_overwrite = pd.concat([df_base, df_treva]).sort_values(by='row_id')
  df_overwrite[['row_id', 'place_id']].sort_values(by='row_id').to_csv(output, index=False)
  print("ensure dim:", len(df_treva), len(set(df_treva.row_id.values)), len(set(df_overwrite.row_id.values)))
  print("overwrite output written in %s @ %s" % (output, datetime.now()))
Example #29
def correl(df,periods=21,columns=None,include=True,str=None,detail=False,how='value',**correl_kwargs):
	"""
		how : string
			value
			pct_chg
			diff
	"""
	def _correl(df,periods=21,columns=None,include=True,str=None,detail=False,**correl_kwargs):
		study='CORREL'
		df,_df,columns=validate(df,columns)

		_df['CORREL'] = df[columns[0]].rolling(window=periods,**correl_kwargs).corr(df[columns[1]])

		str=str if str else 'CORREL({column1},{column2},{period})'.format(column1=columns[0],column2=columns[1],period=periods)
		return rename(df,_df,study,periods,columns,include,str,detail)
	columns=df.columns if not columns else columns
	if len(columns) != 2: 
		raise StudyError("2 Columns need to be specified for a correlation study")
	periods=make_list(periods)
	if how=='pct_chg':
		df=df[columns].pct_change()
	elif how=='diff':
		df=df[columns].diff()
	__df=pd.concat([_correl(df,columns=columns,periods=y,include=False,str=str) for y in periods],axis=1)
	if include:
		return pd.concat([df,__df],axis=1)
	else:
		return __df
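
A minimal sketch of the rolling-correlation idea above on toy series (the column names and 5-bar window are assumptions):

import pandas as pd
import numpy as np

df = pd.DataFrame({'a': np.arange(10, dtype=float),
                   'b': [0, 1, 3, 2, 5, 4, 6, 8, 7, 9]})
corr = df['a'].rolling(window=5).corr(df['b'])
print(pd.concat([df, corr.rename('CORREL(a,b,5)')], axis=1))
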
Example #30
def get_peaks(sub_gene_df, top_s, max_dist, feature_name):
    """
    For each gene in gene_info get the
    peaks within max_dist in top_s. This 
    is basically reverse engineering to get
    the peak info for each gene that was found 
    to be associated with a peak. 
    The reason for reverse engineering rather than 
    storing this information when searching for the genes
    for each peak is that we want to use precisely the same
    function to search the genes for the real data and for the 
    permutations.


    Input:
    gene_info ... data frame with index ('chrom','start')
                and columns 'gene_id' and 'end'
    top_s ... series of peak positions with index (chrom, pos)
                and values peak height
    max_dist ... maximum distance between gene and peak
    """
    gene_info = sub_gene_df

    def get_dist(df, gene_pos):
        """
        calculate distance
        """
        s = pd.Series(df.index.droplevel(0).values - gene_pos.ix[df.index[0][0]], index=df.index.droplevel(0).values)
        return s

    tot_gene_peaks_df = pd.DataFrame()
    if not top_s.index.is_monotonic:
        top_s = top_s.sortlevel([0, 1])
    if not gene_info.index.is_monotonic:
        gene_info = gene_info.sort_index()
    for chrom in gene_info.index.droplevel(1).unique():
        loc_top_s = top_s.ix[chrom]
        start = np.searchsorted(loc_top_s.index.values + max_dist, gene_info.ix[chrom].index.values)
        end = np.searchsorted(loc_top_s.index.values - max_dist, gene_info.ix[chrom]["end"].values)
        x = pd.concat(
            [loc_top_s.iloc[st:ed] for st, ed in zip(start, end)], keys=gene_info.ix[chrom][feature_name].values
        )
        x.name = "peak_height"

        dist_start = x.groupby(lambda i: i[0]).apply(
            lambda df: get_dist(df, gene_info.ix[chrom].reset_index().set_index(feature_name)["start"])
        )
        dist_start.name = "dist_start"
        dist_end = x.groupby(lambda i: i[0]).apply(
            lambda df: get_dist(df, gene_info.ix[chrom].set_index(feature_name)["end"])
        )
        dist_end.name = "dist_end"
        gene_peaks_df = pd.concat([x, dist_start, dist_end], axis=1)
        gene_peaks_df.index = pd.MultiIndex.from_arrays(
            [gene_peaks_df.index.droplevel(1), [chrom] * len(x), gene_peaks_df.index.droplevel(0)]
        )
        tot_gene_peaks_df = pd.concat([tot_gene_peaks_df, gene_peaks_df], axis=0)

    tot_gene_peaks_df.index.names = [feature_name, "chrom", "peak_pos"]
    return tot_gene_peaks_df
Example #31
def call_opt_ideal_maxbudget(option, wage_max, wage, ser_prov, demand, supply, ser_max, row_i, col_j, provider_list, 
                             overhead_work, FTE_time , service_name):
    '''
    core LP to optimize the allocation by wage or priority --- the wage/suitability weighting is found via grid search
    '''
    total_wage = []; total_sutab = []; d=[]; detail_result=[]; 
    v = np.arange(0, 1.01, 0.1); w_weight = None; s= None
    for i in v:
        wi_weight = i; si_weight = 1- i; 
        if( option == 'ideal_staffing'):
            dataset, tt = call_opt_ideal(wi_weight, si_weight, wage, ser_prov, demand, ser_max, row_i, col_j,FTE_time)
        if( option == 'ideal_staffing_current'):
            dataset, tt = call_opt_current(wi_weight, si_weight, wage, ser_prov, demand, supply, ser_max, row_i, col_j,\
                                           FTE_time, overhead_work, provider_list)
        if tt > 0:
            # calculate statistics
            dataset.columns = provider_list['provider_abbr']
            df = dataset.apply(sum, axis = 0)
            doctime = overhead_work.loc[0, provider_list['provider_abbr']  ]
            totaldoctime = overhead_work.loc[0, 'prop_f2f_tot']*demand.sum()[0]*doctime
            cortime = overhead_work.loc[1, provider_list['provider_abbr']  ]
            totalcortime = overhead_work.loc[1, 'prop_f2f_tot']*demand.sum()[0]*cortime
            df = df + totaldoctime + totalcortime
            df = (((df/FTE_time *10)/5).astype(float).round())/2
            total_wage.append( np.round( sum(df*supply['provider_mean_wage']), 0) )
        else: 
            s = 'Excess supply'
    
    if(s == None): 
        if( wage_max < min(total_wage) ):
            s = 'Try a higher maximum wage. Available minimum/maximum wage to minimize wage or minimize \
            suitability score is:' +round(min(total_wage)).astype(str)+ ' and '+  round(max(total_wage)).astype(str)
            print(s)
    else: wage_max = 0
    
    if( wage_max >= min(total_wage) ):
        #print( 'Narrow the search.. it takes few seconds')
        mini = min( np.where( np.array(total_wage) < wage_max )[0] )
        if mini == 0: w_weight = 0
        else:
            total_wage = []; sv = np.arange(v[mini]-0.1, v[mini]+0.001, 0.01)
            for i in sv:
                wi_weight = i; si_weight = 1- i; 
                if( option == 'ideal_staffing'):
                    dataset, tt = call_opt_ideal(wi_weight, si_weight, wage, ser_prov, demand, ser_max, row_i, col_j,FTE_time)
                if( option == 'ideal_staffing_current'):
                    dataset, tt = call_opt_current(wi_weight, si_weight, wage, ser_prov, demand, supply, ser_max, row_i, \
                                                   col_j,FTE_time, overhead_work, provider_list)
                if tt > 0:
                    # calculate statistics
                    dataset.columns = provider_list['provider_abbr']
                    df = dataset.apply(sum, axis = 0)
                    doctime = overhead_work.loc[0, provider_list['provider_abbr']  ]
                    totaldoctime = overhead_work.loc[0, 'prop_f2f_tot']*demand.sum()[0]*doctime
                    cortime = overhead_work.loc[1, provider_list['provider_abbr']  ]
                    totalcortime = overhead_work.loc[1, 'prop_f2f_tot']*demand.sum()[0]*cortime
                    df = df + totaldoctime + totalcortime
                    df = (((df/FTE_time *10)/5).astype(float).round())/2
                    total_wage.append( np.round( sum(df*supply['provider_mean_wage']), 0) )
                else: 
                    total_wage.append(0 )
            mini = min( np.where( np.array(total_wage) < wage_max )[0] )
            w_weight = sv[mini]
    
        s_weight = 1-w_weight
        if( option == 'ideal_staffing'):
            dataset, tt = call_opt_ideal(w_weight, s_weight, wage, ser_prov, demand, ser_max,row_i, col_j,FTE_time)
        if( option == 'ideal_staffing_current'):
            dataset, tt = call_opt_current(w_weight, s_weight, wage, ser_prov, demand, supply, ser_max,  row_i, col_j,\
                                           FTE_time, overhead_work, provider_list)
        
        # calculate statistics
        if tt == 0:
            s = 'Can not find optimal allocation. Change input'
        else:
            dataset.columns = provider_list['provider_abbr']
            detail_result = pd.concat([service_name, dataset], axis = 1)
            df = dataset.apply(sum, axis = 0)
            doctime = overhead_work.loc[0, provider_list['provider_abbr']  ]
            totaldoctime = overhead_work.loc[0, 'prop_f2f_tot']*demand.sum()[0]*doctime
            cortime = overhead_work.loc[1, provider_list['provider_abbr']  ]
            totalcortime = overhead_work.loc[1, 'prop_f2f_tot']*demand.sum()[0]*cortime
            df = df + totaldoctime + totalcortime
            df = (((df/FTE_time *10)/5).astype(float).round())/2
            df.columns = 'FTE'
            total_wage = np.round( sum(df*supply['provider_mean_wage']), 0) 
            total_sutab = np.round( sum((dataset * ser_prov).apply(sum, axis = 0))/sum(dataset.apply(sum, axis = 0)), 2)
            ind_wage = np.round( df*supply['provider_mean_wage'], 0) 
            ind_sutab = np.round( (dataset * ser_prov).apply(sum, axis = 0)/dataset.apply(sum, axis = 0) ,2)
            #tmp =  pd.concat([service_name, dataset], axis = 1)
            s = {}
            s = {'total_wage': total_wage, 'total_sutab': total_sutab, 'ind_wage': ind_wage,
                     'ind_sutab': ind_sutab, 'FTE': df, 'detail_f2f_mini': detail_result}
    return s
Example #32
    def assign_power_curve(
        self,
        wake_losses_model="wind_farm_efficiency",
        smoothing=False,
        block_width=0.5,
        standard_deviation_method="turbulence_intensity",
        smoothing_order="wind_farm_power_curves",
        turbulence_intensity=None,
        **kwargs,
    ):
        r"""
        Calculates the power curve of a wind farm.

        The wind farm power curve is calculated by aggregating the power curves
        of all wind turbines in the wind farm. Depending on the parameters the
        power curves are smoothed (before or after the aggregation) and/or a
        wind farm efficiency (power efficiency curve or constant efficiency) is
        applied after the aggregation.
        After the calculations the power curve is assigned to the attribute
        :py:attr:`~power_curve`.

        Parameters
        ----------
        wake_losses_model : str
            Defines the method for taking wake losses within the farm into
            consideration. Options: 'wind_farm_efficiency' or None.
            Default: 'wind_farm_efficiency'.
        smoothing : bool
            If True the power curves will be smoothed before or after the
            aggregation of power curves depending on `smoothing_order`.
            Default: False.
        block_width : float
            Width between the wind speeds in the sum of the equation in
            :py:func:`~.power_curves.smooth_power_curve`. Default: 0.5.
        standard_deviation_method : str
            Method for calculating the standard deviation for the Gauss
            distribution. Options: 'turbulence_intensity',
            'Staffell_Pfenninger'. Default: 'turbulence_intensity'.
        smoothing_order : str
            Defines when the smoothing takes place if `smoothing` is True.
            Options: 'turbine_power_curves' (to the single turbine power
            curves), 'wind_farm_power_curves'.
            Default: 'wind_farm_power_curves'.
        turbulence_intensity : float
            Turbulence intensity at hub height of the wind farm for power curve
            smoothing with 'turbulence_intensity' method. Can be calculated
            from `roughness_length` instead. Default: None.
        roughness_length : float (optional)
            Roughness length. If `standard_deviation_method` is
            'turbulence_intensity' and `turbulence_intensity` is not given
            the turbulence intensity is calculated via the roughness length.

        Returns
        -------
        :class:`~.wind_farm.WindFarm`
            self

        """
        # Check if all wind turbines have a power curve as attribute
        for turbine in self.wind_turbine_fleet["wind_turbine"]:
            if turbine.power_curve is None:
                raise ValueError(
                    "For an aggregated wind farm power curve " +
                    "each wind turbine needs a power curve " +
                    "but `power_curve` of '{}' is None.".format(turbine))
        # Initialize data frame for power curve values
        df = pd.DataFrame()
        for ix, row in self.wind_turbine_fleet.iterrows():
            # Check if needed parameters are available and/or assign them
            if smoothing:
                if (standard_deviation_method == "turbulence_intensity"
                        and turbulence_intensity is None):
                    if ("roughness_length" in kwargs
                            and kwargs["roughness_length"] is not None):
                        # Calculate turbulence intensity and write to kwargs
                        turbulence_intensity = tools.estimate_turbulence_intensity(
                            row["wind_turbine"].hub_height,
                            kwargs["roughness_length"],
                        )
                        kwargs["turbulence_intensity"] = turbulence_intensity
                    else:
                        raise ValueError(
                            "`roughness_length` must be defined for using " +
                            "'turbulence_intensity' as " +
                            "`standard_deviation_method` if " +
                            "`turbulence_intensity` is not given")
            # Get original power curve
            power_curve = pd.DataFrame(row["wind_turbine"].power_curve)
            # Modifications to the power curves before the summation
            if smoothing and smoothing_order == "turbine_power_curves":
                power_curve = power_curves.smooth_power_curve(
                    power_curve["wind_speed"],
                    power_curve["value"],
                    standard_deviation_method=standard_deviation_method,
                    block_width=block_width,
                    **kwargs,
                )
            else:
                # Add value zero to start and end of curve as otherwise
                # problems can occur during the aggregation
                if power_curve.iloc[0]["wind_speed"] != 0.0:
                    power_curve = pd.concat(
                        [
                            pd.DataFrame(data={
                                "value": [0.0],
                                "wind_speed": [0.0]
                            }),
                            power_curve,
                        ],
                        join="inner",
                    )
                if power_curve.iloc[-1]["value"] != 0.0:
                    power_curve = pd.concat(
                        [
                            power_curve,
                            pd.DataFrame(
                                data={
                                    "wind_speed": [
                                        power_curve["wind_speed"].loc[
                                            power_curve.index[-1]] + 0.5
                                    ],
                                    "value": [0.0],
                                }),
                        ],
                        join="inner",
                    )
            # Add power curves of all turbine types to data frame
            # (multiplied by turbine amount)
            df = pd.concat(
                [
                    df,
                    pd.DataFrame(
                        power_curve.set_index(["wind_speed"]) *
                        row["number_of_turbines"]),
                ],
                axis=1,
            )
        # Aggregate all power curves
        wind_farm_power_curve = pd.DataFrame(
            df.interpolate(method="index").sum(axis=1))
        wind_farm_power_curve.columns = ["value"]
        wind_farm_power_curve.reset_index(inplace=True)
        # Apply power curve smoothing and consideration of wake losses
        # after the summation
        if smoothing and smoothing_order == "wind_farm_power_curves":
            wind_farm_power_curve = power_curves.smooth_power_curve(
                wind_farm_power_curve["wind_speed"],
                wind_farm_power_curve["value"],
                standard_deviation_method=standard_deviation_method,
                block_width=block_width,
                **kwargs,
            )
        if wake_losses_model == "wind_farm_efficiency":
            if self.efficiency is not None:
                wind_farm_power_curve = power_curves.wake_losses_to_power_curve(
                    wind_farm_power_curve["wind_speed"].values,
                    wind_farm_power_curve["value"].values,
                    wind_farm_efficiency=self.efficiency,
                )
            else:
                msg = (
                    "If you use `wake_losses_model` '{model}' your WindFarm "
                    "needs an efficiency but `efficiency` is {eff}. \n\n"
                    "Failing farm:\n {farm}")
                raise ValueError(
                    msg.format(model=wake_losses_model,
                               farm=self,
                               eff=self.efficiency))
        self.power_curve = wind_farm_power_curve
        return self
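
A tiny sketch of the aggregation step above: align two turbine power curves on a shared wind-speed index via an outer concat, interpolate on the index, and sum (the numbers are made up):

import pandas as pd

pc1 = pd.DataFrame({'wind_speed': [0.0, 5.0, 10.0], 'value': [0.0, 100.0, 500.0]})
pc2 = pd.DataFrame({'wind_speed': [0.0, 4.0, 10.0], 'value': [0.0, 80.0, 400.0]})

# Outer-join on wind speed, fill gaps by index interpolation, then sum per speed
df = pd.concat([pc.set_index('wind_speed') for pc in (pc1, pc2)], axis=1)
farm_curve = df.interpolate(method='index').sum(axis=1)
print(farm_curve)
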
Example #33
	# repeat each row's index once for every element of knownForTitles
	idx = name_df.index.repeat(name_df['knownForTitles'].str.len())

	# split the list values in each row and combine them with the other rows into a dataframe
	df1 = pd.DataFrame({
		x: np.concatenate(name_df[x].values)
	})

	# replace that dataframe's index with the idx defined above
	df1.index = idx

	# append every dataframe that is produced to the bucket of dataframes
	df_uni.append(df1)

# combine all the dataframes into one
df_concat = pd.concat(df_uni, axis=1)

# join with the values from the original dataframe
unnested_df = df_concat.join(name_df.drop(['knownForTitles'], 1), how='left')

# select the columns to match the original dataframe
unnested_df = unnested_df[name_df.columns.tolist()]
print(unnested_df)


# # [Group primaryName into a list, grouped by knownForTitles](https://academy.dqlab.id/main/projectcode/214/394/1977)

# In[11]:


unnested_drop = unnested_df.drop(['nconst'], axis=1)
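
For comparison, a self-contained sketch of the same unnesting with modern pandas, where Series.explode performs the row-wise expansion (toy data; the column values are made up):

import pandas as pd

name_df = pd.DataFrame({'primaryName': ['a', 'b'],
                        'knownForTitles': [['t1', 't2'], ['t3']]})
unnested = name_df.explode('knownForTitles').reset_index(drop=True)
print(unnested)
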
Example #34
 def get_dtypes(cls, dtypes_ids):
     return (pandas.concat(cls.materialize(dtypes_ids),
                           axis=1).apply(
                               lambda row: find_common_type_cat(row.values),
                               axis=1).squeeze(axis=0))
Example #35
print (df_m_narrow.dtypes)


# In[55]:


df_m_narrow['Date'] = pd.to_datetime(df_m_narrow['Date']) 
print (df_m_narrow.dtypes)


# In[201]:


df_m_narrow_dates = df_m_narrow.set_index('Date')
df_m_narrow_dates.head()


# In[193]:


combos = [df_mtd, df_m_narrow_dates] #listing the data sets
combined = pd.concat(combos) #combining the datasets
combined


# In[ ]:




Example #36
print(ParentLevel.head(5))

Lunch=pd.get_dummies(df["lunch"],drop_first=True)
print(Lunch.head(5))

TestPreperation=pd.get_dummies(df["test preparation course"],drop_first=True)
print(TestPreperation.head(5))



RaceEthnicity=pd.get_dummies(df['race/ethnicity'],drop_first=True)
print(RaceEthnicity.head(5))

print(df.head(2))

X=pd.concat([Gender,RaceEthnicity,ParentLevel,TestPreperation,Lunch],axis=1)
print(X.head(1))



from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import neighbors
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,random_state=0,test_size=0.1)
model = neighbors.KNeighborsRegressor()
model.fit(X_train,Y_train)
predictions=model.predict(X_test)

mean_squared_error(predictions,Y_test)

scores = df.loc[:,["math score","reading score","writing score"]]
Example #37
def plot_supply_demand(current_demand, current_supply):
    p = pd.concat([current_demand, current_supply], axis = 1)
    p.columns = ['current_needs','current_supply']
    p.plot(kind='bar') # supply demand plot
    plt.title('Needs vs. Supply')
    plt.show()
Example #38
def input_create_future(geo, year,current_year, sut_target, sdoh_score, pop_chronic_trend,  pop_chronic_prev, chron_care_freq, 
             geo_area, service_characteristics, pop_acute_need, population, provider_supply, pop_prev_need , 
             provider_list , encounter_detail, overhead_work):
    yeardiff = int(year) - int(current_year) 
    # every provider should follow the order of provider_list['provider_abbr']
    population = population.loc[ population['pop_geo_area'] == geo, : ]
    population = population[ ['pop_sex','pop_age', year] ] 
    #total_pop = population[year].sum()

    # preventive demand
    prev_ser = encounter_detail.loc[ encounter_detail['encounter_category'] == 'Preventive',:]
    prev_df = pd.merge(prev_ser, service_characteristics,  how='left', \
                      left_on=['svc_category','svc_desc'], right_on = ['svc_category','svc_desc'])
    p_demand = []
    for i in range(len(prev_df)): # demand = rate_per_encounter * freq * n of population * time
        tmpid = prev_df.loc[i,'encounter_type']
        tmp = pop_prev_need[ ['pop_min_age','pop_max_age','pop_sex',tmpid] ]
        freq = tmp[ tmpid ].astype(float)
        s = tmp[ 'pop_min_age'].astype(int); e = tmp['pop_max_age'].astype(int); g = tmp['pop_sex']
        s = s.loc[ tmp[tmpid] > 0 ]; e = e.loc[ tmp[tmpid] > 0 ]; g = g.loc[ tmp[tmpid] > 0 ]  # total population needing the service
        freq = freq.loc[ tmp[tmpid] > 0 ];
        t_demand = 0
        for j in range( sum( tmp[tmpid] > 0 )): # won't have BOTH
            if(g.iloc[j]== 'F'): r = population.loc[ (population['pop_age'] >= s.iloc[j]) & \
              (population['pop_age'] <= e.iloc[j]) & (population['pop_sex'] == 'F'), year ].sum()
            if(g.iloc[j] == 'M'): r = population.loc[ (population['pop_age'] >= s.iloc[j]) & \
              (population['pop_age'] <= e.iloc[j]) & (population['pop_sex'] == 'M'), year ].sum()
            t_demand = t_demand + r*freq.iloc[j]
        p_demand.append(t_demand) # frequency * population 
    f2f1 = prev_df['max_f2f_time']; f2f0 = prev_df['min_f2f_time'] 
    f2f = (f2f1 + f2f0)/5.0*sdoh_score.values
    
    prev_demand = prev_df['rate_per_encounter'] * f2f * p_demand # rate_per_encounter
    prev_service_name  = prev_df[['encounter_category','encounter_type', 'svc_category', 'svc_desc']]
    prev_ser_prv = prev_df[ provider_list['provider_abbr'] ]
    
    # acute demand == assume excel file updated
    acute_ser = encounter_detail.loc[encounter_detail['encounter_category'] == 'Acute',:]
    acute_df = pd.merge(acute_ser, service_characteristics,  how='left', \
                      left_on=['svc_category','svc_desc'], right_on = ['svc_category','svc_desc'])

    a_demand = []
    for i in range(len(acute_df)): # demand = rate_per_encounter * prev * n of population * time
        tmpid = acute_df.loc[i,'encounter_type']
        
        tmp = pop_acute_need[ ['pop_min_age','pop_max_age','pop_sex',tmpid] ]
        prev = tmp[ tmpid ].astype(float)
        s = tmp[ 'pop_min_age'].astype(int); e = tmp['pop_max_age'].astype(int); g = tmp['pop_sex']
        s = s.loc[ tmp[tmpid] > 0 ]; e = e.loc[ tmp[tmpid] > 0 ]; g = g.loc[ tmp[tmpid] > 0 ]  # total population needing the service
        prev = prev.loc[ tmp[tmpid] > 0 ];
        t_demand = 0
        for j in range( sum( tmp[tmpid] > 0 )): # won't have BOTH
            if(g.iloc[j]== 'F'): r = population.loc[ (population['pop_age'] >= s.iloc[j]) & \
              (population['pop_age'] <= e.iloc[j]) & (population['pop_sex'] == 'F'), year ].sum()
            if(g.iloc[j] == 'M'): r = population.loc[ (population['pop_age'] >= s.iloc[j]) & \
              (population['pop_age'] <= e.iloc[j]) & (population['pop_sex'] == 'M'), year ].sum()
            t_demand = t_demand + r *prev.iloc[j]/1000 # proportion per 1000
        a_demand.append(t_demand) 
        # total demand
    f2f1 = acute_df['max_f2f_time']; f2f0 = acute_df['min_f2f_time']
    f2f = (f2f1 + f2f0)/5.0*sdoh_score.values
    
    acute_demand = acute_df['rate_per_encounter'] * f2f * a_demand
    acute_service_name  = acute_df[['encounter_category','encounter_type', 'svc_category', 'svc_desc']]
    acute_ser_prv = acute_df[ provider_list['provider_abbr'] ]
    
    # chronic demand
    chro_ser = encounter_detail.loc[encounter_detail['encounter_category'] == 'Chronic',:]
    chro_df = pd.merge(chro_ser, service_characteristics,  how='left', \
                      left_on=['svc_category','svc_desc'], right_on = ['svc_category','svc_desc']) 
    # service level
    c_demand = []
    for i in range(len(chro_df)): # demand = rate_per_encounter(prev)* freq * prev*n of population * time
        tmpid = chro_df.loc[i,'encounter_type']
        freq = chron_care_freq.loc[ chron_care_freq[ tmpid ] > 0, ['chron_cond_abbr', tmpid]]
        # disease level
        t_demand = 0; lf = len(freq)
        if( lf  > 0 ):
            for m in range(len(freq)):
                prev_freq = pop_chronic_prev[ freq.iloc[m, 0] ]*freq.iloc[m,1].astype(float)
                prev_freq = prev_freq.values
                prev_freq = np.squeeze(prev_freq)

                tmp1 = pop_chronic_trend[ freq.iloc[m, 0]]
                tmp = pop_chronic_prev[ ['pop_min_age','pop_max_age','pop_sex' ] ]
                s = tmp[ 'pop_min_age'].astype(int); e = tmp['pop_max_age'].astype(int); g = tmp['pop_sex']
                for j in np.where(prev_freq > 0)[0]: 
                    if(g.iloc[j]== 'F'): r = population.loc[ (population['pop_age'] >= s.iloc[j]) & \
                      (population['pop_age'] <= e.iloc[j]) & (population['pop_sex'] == 'F'), year ].sum()
                    if(g.iloc[j] == 'M'): r = population.loc[ (population['pop_age'] >= s.iloc[j]) & \
                      (population['pop_age'] <= e.iloc[j]) & (population['pop_sex'] == 'M'), year ].sum()
                    t_demand = t_demand + r * prev_freq[j]*(1+ tmp1[j])**(yeardiff)/1000
        c_demand.append(t_demand) # population * prev * freq
    # total demand
    f2f1 = chro_df['max_f2f_time']; f2f0 = chro_df['min_f2f_time']
    f2f = (f2f1 + f2f0)/5.0*sdoh_score.values
    
    chronic_demand = chro_df['rate_per_encounter'] * f2f * c_demand
    chronic_service_name  = chro_df[['encounter_category','encounter_type', 'svc_category', 'svc_desc']]
    chronic_ser_prv = chro_df[ provider_list['provider_abbr'] ]
            
    demand = pd.concat( [prev_demand, acute_demand, chronic_demand ] ).reset_index( drop=True )
    demand = demand.to_frame('demand') # demand
    ser_prov = pd.concat( [prev_ser_prv, acute_ser_prv, chronic_ser_prv ] ).reset_index( drop=True )
    service_name = pd.concat( [prev_service_name, acute_service_name, chronic_service_name] ).reset_index( drop=True )
    
    supply = provider_supply.loc[ provider_supply['provider_geo_area'] == geo, : ]
    nprovidernum = supply['provider_num']*(1+supply['provider_growth_trend'])**(yeardiff)
    nproviderwage = supply['provider_mean_wage']*(1+supply['provider_wage_trend'])**(yeardiff)
    supply = pd.concat([ supply['provider_abbr'], nprovidernum, nproviderwage], axis = 1)
    supply.columns = ['provider_abbr','provider_num','provider_mean_wage']
    supply.index = supply['provider_abbr']
    wage = supply['provider_mean_wage']/sum(supply['provider_mean_wage'])

    # suitability is optimized against sut_target
    if( sut_target > 0):
        for col in provider_list['provider_abbr']: 
            ser_prov[col] = ser_prov[col].replace(r'^\s*$', np.nan, regex=True).astype(float)
            v = 2*sut_target - ser_prov.loc[ser_prov[col] > sut_target, col] 
            ser_prov.loc[ ser_prov[col] > sut_target, col ] =  v 
            ser_prov[col] = 1- ser_prov[col]/sut_target
   
    # need to remove NA
    ser_prov = ser_prov.fillna(1.1) 
    # when a licence does not allow a service, it gets 1.1 (scale: 1 = top of the licence, 0 = very easy)
    supply = supply.fillna(0)
    overhead_work = overhead_work.fillna(0) 
    wage = wage.fillna(0)
    
    k = (demand==0) | (np.isnan(demand)) 
    p = np.where( ~k )
    ser_prov = ser_prov.iloc[p[0], :].reset_index( drop=True )
    demand = demand.iloc[p[0]].reset_index( drop=True )
    service_name = service_name.iloc[p[0], :].reset_index(drop=True  )
    
    wage = wage.loc[ provider_list['provider_abbr'] ]
    ser_prov = ser_prov[ provider_list['provider_abbr']  ]
    supply = supply.loc[ provider_list['provider_abbr'] ]
    return wage, ser_prov, demand, supply, overhead_work,  provider_list, service_name
Exemplo n.º 39
0
def kfold_lightgbm(params,df, predictors,target,num_folds, stratified = True,
                   objective='', metrics='',debug= False,
                   feval = f1_score_vali, early_stopping_rounds=100, num_boost_round=100, verbose_eval=50, categorical_features=None,sklearn_mertric = evaluate_macroF1_lgb ):

    lgb_params = params
    
    train_df = df[df[target].notnull()]
    test_df = df[df[target].isnull()]
    
    # Divide in training/validation and test data
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df[predictors].shape, test_df[predictors].shape))
    del df
    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1234)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=1234)
#    folds = GroupKFold(n_splits=5)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros((train_df.shape[0],11))
    sub_preds = np.zeros((test_df.shape[0],11))
    feature_importance_df = pd.DataFrame()
    feats = predictors
    cv_resul = []
    '''
    perm = [i for i in range(len(train_df))]
    perm = pd.DataFrame(perm)
    perm.columns = ['index_']

    for n_fold in range(5):
        train_idx = np.array(perm[train_df['cv'] != n_fold]['index_'])
        valid_idx = np.array(perm[train_df['cv'] == n_fold]['index_'])
    '''
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df[target])):
        if (USE_KFOLD == False) and (n_fold == 1):
            break
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[target].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[target].iloc[valid_idx]

        train_x = pd.concat([train_x,train_old[feats]])
        train_y = pd.concat([train_y,train_old[target]])

        train_y_t = train_y.values
        valid_y_t = valid_y.values
        print(train_y_t)
        xgtrain = lgb.Dataset(train_x.values, label = train_y_t,
                              feature_name=predictors,
                              categorical_feature=categorical_features
                              )
        xgvalid = lgb.Dataset(valid_x.values, label = valid_y_t,
                              feature_name=predictors,
                              categorical_feature=categorical_features
                              )

        clf = lgb.train(lgb_params, 
                         xgtrain, 
                         valid_sets=[xgvalid],#, xgtrain], 
                         valid_names=['valid'],#,'train'], 
                         num_boost_round=num_boost_round,
                         early_stopping_rounds=early_stopping_rounds,
                         verbose_eval=verbose_eval, 
#                         feval=feval
                         )



        oof_preds[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration)
        sub_preds += clf.predict(test_df[feats], num_iteration=clf.best_iteration)/ folds.n_splits


        gain = clf.feature_importance('gain')
        fold_importance_df = pd.DataFrame({'feature':clf.feature_name(),
                                           'split':clf.feature_importance('split'),
                                           'gain':100*gain/gain.sum(),
                                           'fold':n_fold,                        
                                           }).sort_values('gain',ascending=False)
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        
        result = evaluate_macroF1_lgb(valid_y, oof_preds[valid_idx])
#        result = clf.best_score['valid']['macro_f1_score']
        print('Fold %2d macro-f1 : %.6f' % (n_fold + 1, result))
        cv_resul.append(round(result,5))
        gc.collect()
        
    #score = np.array(cv_resul).mean()
    score = 'model_2'
    if USE_KFOLD:
        #print('Full f1 score %.6f' % score)
        for i in range(11):
            train_df["class_" + str(i)] = oof_preds[:,i]
            test_df["class_" + str(i)] = sub_preds[:,i]
        train_df[['user_id'] + ["class_" + str(i) for i in range(11)]].to_csv('./cv/val_prob_{}.csv'.format(score), index= False, float_format = '%.4f')
        test_df[['user_id'] + ["class_" + str(i) for i in range(11)]].to_csv('./cv/sub_prob_{}.csv'.format(score), index= False, float_format = '%.4f')   
        oof_preds = [np.argmax(x)for x in oof_preds]
        sub_preds = [np.argmax(x)for x in sub_preds]    
        train_df[target] = oof_preds
        test_df[target] = sub_preds
        print(test_df[target].mean())
        train_df[target] = oof_preds
        train_df[target] = train_df[target].map(label2current_service)
        test_df[target] = sub_preds
        test_df[target] = test_df[target].map(label2current_service)
        print('all_cv', cv_resul)
        train_df[['user_id', target]].to_csv('./sub/val_{}.csv'.format(score), index= False)
        test_df[['user_id', target]].to_csv('./sub/sub_{}.csv'.format(score), index= False)
        print("test_df mean:")
    
    display_importances(feature_importance_df,score)
def resource_allocation(option, sub_option, wage, ser_prov, demand, supply, overhead_work,  provider_list, service_name, 
                        collapse_group, w_weight, s_weight, wage_max, FTE_time):
    # dimension   
    n_ser = len(demand)
    n_provider = len(provider_list)
    col_j = range(n_provider)
    row_i = range(n_ser)
    
    ser_max = pd.DataFrame(index=range(n_ser),columns=provider_list['provider_abbr'])
    for i in range( n_ser ):# service
        for m in provider_list['provider_abbr']:
            max_val  = (ser_prov.loc[i, m] <= 1) * demand.loc[i,'demand'] 
            ser_max.loc[i,m] = max_val 

    #====== optimization 
    total_wage = []; total_sutab = []; detail_result = []; 
    d = pd.DataFrame(index = provider_list['provider_abbr'])
    
    if( (option == 'ideal_staffing') | (option == 'ideal_staffing_current') ):
        if (sub_option == "all_combination" ):
            co = 0; s = {}
            for i in np.arange(0, 1.1, 0.1):
                wi_weight = i; si_weight = 1- i; co = co + 1
                if( option == 'ideal_staffing'):
                    dataset, tt = call_opt_ideal(wi_weight, si_weight, wage, ser_prov, demand, ser_max, row_i, col_j,FTE_time)
                if( option == 'ideal_staffing_current'):
                    dataset, tt = call_opt_current(wi_weight, si_weight, wage, ser_prov, demand, supply, ser_max, \
                                                   row_i, col_j,FTE_time, overhead_work, provider_list)
                # calculate statistics
                if tt == 0:
                    df = pd.DataFrame(np.nan, index=provider_list['provider_abbr'], columns = [i])
                    d = pd.concat( [ d, df], axis = 1)   
                    total_wage.append( np.nan )
                    total_sutab.append( np.nan )
                else:
                    dataset.columns = provider_list['provider_abbr'] # F2F
                    df = dataset.apply(sum, axis = 0)
                    doctime = overhead_work.loc[0, provider_list['provider_abbr']  ]
                    totaldoctime = overhead_work.loc[0, 'prop_f2f_tot']*demand.sum()[0]*doctime
                    cortime = overhead_work.loc[1, provider_list['provider_abbr']  ]
                    totalcortime = overhead_work.loc[1, 'prop_f2f_tot']*demand.sum()[0]*cortime
                    df = df + totaldoctime + totalcortime
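                    # df/FTE_time is the raw FTE requirement; multiplying by 2, rounding and halving rounds it to the nearest 0.5 FTE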
                    df = (((df/FTE_time *10)/5).astype(float).round())/2
                    d = pd.concat( [d, df], axis = 1) 
                    total_wage.append( np.round( sum(df*supply['provider_mean_wage']), 0) )
                    total_sutab.append( sum((dataset * ser_prov).apply(sum, axis = 0))/sum(dataset.apply(sum, axis = 0)) )
                    dataset['weight'] = wi_weight
                    if(co == 1): detail_result =  pd.concat([service_name, dataset], axis = 1)
                    else: 
                        tmp =  pd.concat([service_name, dataset], axis = 1)
                        detail_result = pd.concat([detail_result, tmp], axis = 0)
            d.columns = ['w_0.0','w_0.1','w_0.2','w_0.3','w_0.4','w_0.5','w_0.6','w_0.7','w_0.8','w_0.9', 'w_1.0']
            s = {'total_wage': total_wage, 'total_sutab': total_sutab, 'FTE': d, 'detail_f2f_mini': detail_result}
        
        if( sub_option == "wage_weight"  ) :
            if( option == 'ideal_staffing'):
                dataset, tt = call_opt_ideal(w_weight, s_weight, wage, ser_prov, demand, ser_max,row_i, col_j,FTE_time)
            if( option == 'ideal_staffing_current'):
                dataset, tt = call_opt_current(w_weight, s_weight, wage, ser_prov, demand, supply, ser_max, \
                                               row_i, col_j,FTE_time,overhead_work, provider_list)
            # calculate statistics
            if tt == 0:
                s = 'Cannot find an optimal allocation. Check the input.'
            else:
                dataset.columns = provider_list['provider_abbr']
                detail_result = pd.concat([service_name, dataset], axis = 1)
                
                df = dataset.apply(sum, axis = 0)
                doctime = overhead_work.loc[0, provider_list['provider_abbr']  ]
                totaldoctime = overhead_work.loc[0, 'prop_f2f_tot']*demand.sum()[0]*doctime
                cortime = overhead_work.loc[1, provider_list['provider_abbr']  ]
                totalcortime = overhead_work.loc[1, 'prop_f2f_tot']*demand.sum()[0]*cortime
                df = df + totaldoctime + totalcortime
                df = (((df/FTE_time *10)/5).astype(float).round())/2
                df.columns = 'FTE'
                total_wage = np.round( sum(df*supply['provider_mean_wage']), 0) 
                total_sutab = np.round( sum((dataset * ser_prov).apply(sum, axis = 0))/sum(dataset.apply(sum, axis = 0)) ,2)
                # this is the code to get the total wage and suitability scores of individual provider types
                # if the individual information is useful, use similar code for the 'all_combination' option
                ind_wage = np.round( df*supply['provider_mean_wage'], 0) 
                ind_sutab = np.round( (dataset * ser_prov).apply(sum, axis = 0)/dataset.apply(sum, axis = 0) ,2)
                
                s = {}
                s = {'total_wage': total_wage, 'total_sutab': total_sutab, 'ind_wage': ind_wage,
                     'ind_sutab': ind_sutab, 'FTE': df, 'detail_f2f_mini': detail_result}
                
        if(sub_option ==  "wage_max"):
            s = call_opt_ideal_maxbudget(option, wage_max, wage, ser_prov, demand, supply, ser_max, row_i,\
                                         col_j, provider_list, overhead_work, FTE_time, service_name )
                             
    if( option == 'service_allocation' ):
        #================== get pattern 
        if(collapse_group == True):
            k = service_name['encounter_category']; 
            k1 = service_name['svc_category']
            k2 = k + k1
            p = ser_prov.apply(lambda x: ''.join( ((x <=1 )*1).astype('str') ), axis = 1)
            k2 = k2 + p
            df = pd.concat([k, k1, k2], axis = 1); df.columns = ['d_type','category','comb']
            k1 = df.groupby(["comb"]).size(); n_mem = len(k1) 
        
            # create assignment 
            ser_prov_mem = pd.DataFrame(index=range( len(ser_prov) ),columns=['mem'])
            for i in range( n_mem ):
                ser_prov_mem.loc[ df['comb'] == k1.keys()[i] ] = i        
         
            # total Demand    
            demand_mem = pd.DataFrame(index=range(n_mem),columns=['demand'])
            for k1 in range(n_mem):
                g = demand.loc[ ser_prov_mem['mem'] == k1 , :].apply(sum, axis = 0)
                demand_mem.iloc[k1,:] = g
            
                
            ser_max_mem = pd.DataFrame(index=range(n_mem),columns=provider_list['provider_abbr'])
            for k1 in range(n_mem):
                max_val  = ser_max.loc[ ser_prov_mem['mem'] == k1, : ].apply(sum, axis = 0) 
                ser_max_mem.iloc[k1,:] = max_val
            
            dataset, current_demand = \
            call_assign_service(demand_mem, ser_max_mem, supply, overhead_work, provider_list, FTE_time)
   
            time_allocation = pd.DataFrame(index=range(n_ser),columns=provider_list['provider_abbr'])
            for k1 in range(n_mem):
                tmp =  ser_prov_mem['mem'] == k1; n = sum(tmp)
                if( sum(tmp) == 1 ):
                    time_allocation.loc[np.where(tmp)[0][0],: ] = dataset.iloc[k1,:] 
                else: 
                    i_demand = demand.loc[ np.where(tmp)[0],'demand']; 
                    i_demand = i_demand/sum(i_demand)
                    i = dataset.iloc[k1,:].apply( lambda x: x*i_demand )
                    for j in range(n):
                        time_allocation.loc[ np.where(tmp)[0][j], :] = i.iloc[:,j]  
            dataset = time_allocation
        else: # not collapsing
            dataset, current_demand = \
            call_assign_service(demand, ser_max, supply, overhead_work, provider_list, FTE_time)
            dataset = pd.concat([service_name, dataset], axis = 1)
            
        s = {}
        s = {'FTE': current_demand,  'detail_f2f_mini': dataset}

    return s
Exemplo n.º 41
0
def aggregatelines(network, buses, interlines, line_length_factor=1.0):

    #make sure all lines have same bus ordering
    positive_order = interlines.bus0_s < interlines.bus1_s
    interlines_p = interlines[positive_order]
    interlines_n = interlines[~positive_order].rename(columns={
        "bus0_s": "bus1_s",
        "bus1_s": "bus0_s"
    })
    interlines_c = pd.concat((interlines_p, interlines_n), sort=False)

    attrs = network.components["Line"]["attrs"]
    columns = set(
        attrs.index[attrs.static
                    & attrs.status.str.startswith('Input')]).difference(
                        ('name', 'bus0', 'bus1'))

    consense = {
        attr: _make_consense('Bus', attr)
        for attr in (columns | {'sub_network'} - {
            'r', 'x', 'g', 'b', 'terrain_factor', 's_nom', 's_nom_min',
            's_nom_max', 's_nom_extendable', 'length', 'v_ang_min', 'v_ang_max'
        })
    }

    def aggregatelinegroup(l):

        # l.name is a tuple of the groupby index (bus0_s, bus1_s)
        length_s = haversine_pts(buses.loc[l.name[0], ['x', 'y']],
                                 buses.loc[l.name[1],
                                           ['x', 'y']]) * line_length_factor
        v_nom_s = buses.loc[list(l.name), 'v_nom'].max()

        voltage_factor = (np.asarray(network.buses.loc[l.bus0, 'v_nom']) /
                          v_nom_s)**2
        length_factor = (length_s / l['length'])
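        # The grouped lines are combined as parallel branches: the equivalent r and x
        # follow 1/r_eq = sum(1/r_i) after rescaling, while g, b and s_nom simply add up.
        # voltage_factor maps per-unit impedances of lines with a different nominal
        # voltage onto the aggregated bus voltage, and length_factor rescales each
        # line's parameters to the new haversine-based length length_s.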

        data = dict(r=1. / (voltage_factor / (length_factor * l['r'])).sum(),
                    x=1. / (voltage_factor / (length_factor * l['x'])).sum(),
                    g=(voltage_factor * length_factor * l['g']).sum(),
                    b=(voltage_factor * length_factor * l['b']).sum(),
                    terrain_factor=l['terrain_factor'].mean(),
                    s_nom=l['s_nom'].sum(),
                    s_nom_min=l['s_nom_min'].sum(),
                    s_nom_max=l['s_nom_max'].sum(),
                    s_nom_extendable=l['s_nom_extendable'].any(),
                    num_parallel=l['num_parallel'].sum(),
                    capital_cost=(length_factor * _normed(l['s_nom']) *
                                  l['capital_cost']).sum(),
                    length=length_s,
                    sub_network=consense['sub_network'](l['sub_network']),
                    v_ang_min=l['v_ang_min'].max(),
                    v_ang_max=l['v_ang_max'].min())
        data.update((f, consense[f](l[f])) for f in columns.difference(data))
        return pd.Series(data, index=[f for f in l.columns if f in columns])

    lines = interlines_c.groupby(['bus0_s',
                                  'bus1_s']).apply(aggregatelinegroup)
    lines['name'] = [str(i + 1) for i in range(len(lines))]

    linemap_p = interlines_p.join(lines['name'], on=['bus0_s',
                                                     'bus1_s'])['name']
    linemap_n = interlines_n.join(lines['name'], on=['bus0_s',
                                                     'bus1_s'])['name']
    linemap = pd.concat((linemap_p, linemap_n), sort=False)

    return lines, linemap_p, linemap_n, linemap
Exemplo n.º 42
0
print([(k, list(g)) for k, g in groupby(sorted(lst), key=gb)])
print([(k, list(g)) for k, g in groupby(lst, key=gb)])

list('1234')

[i for i in itertools.chain(str(1234),'fefg')]


s = '3a4b5cdd7e'
print([''.join(list(g)) for k, g in groupby(s, key=lambda x: x.isdigit())])

df = pd.DataFrame()
index = ['alpha', 'beta', 'gamma', 'delta', 'eta']
for i in range(5):
    a = pd.DataFrame([np.linspace(i, 5*i, 5)], index=[index[i]])
    df = pd.concat([df, a], axis=0)
df[1]


tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
'foo', 'foo', 'qux', 'qux'],
['one', 'two', 'one', 'two',
'one', 'two', 'one', 'two']]))
tuples[1]
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index.values
a=np.array([1,2])
t=[[a,b] for a,b in tuples]
a,b=tuples[1]
z=[a,y]
list(a)
import pandas as pd
import glob
import numpy as np
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("input_files", help='input_files')
parser.add_argument("output_files",help='output_files')
args=parser.parse_args()
input = args.input_files
output = open(args.output_files,'w')

filez = glob.glob(input + "*.cnt")
print(filez[1])
t1 = pd.read_csv(filez[0], header=0, sep='\t')
tout = t1.iloc[:,0]
for f in filez:
	t1= pd.read_csv(f, header=0, sep='\t')
	tout= pd.concat([tout, t1.iloc[:,6]], axis=1)

tout.to_csv(output)
Exemplo n.º 44
0
def aggregategenerators(network,
                        busmap,
                        with_time=True,
                        carriers=None,
                        custom_strategies=dict()):
    if carriers is None:
        carriers = network.generators.carrier.unique()

    gens_agg_b = network.generators.carrier.isin(carriers)
    attrs = network.components["Generator"]["attrs"]
    generators = (network.generators.loc[gens_agg_b].assign(
        bus=lambda df: df.bus.map(busmap)))
    columns = (set(
        attrs.index[attrs.static & attrs.status.str.startswith('Input')])
               | {'weight'}) & set(generators.columns) - {'control'}
    grouper = [generators.bus, generators.carrier]
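    # Weighting helper: normalise the generator weights within each (bus, carrier) group,
    # falling back to a uniform split when the group's weights sum to zero or contain NaN
    # (in that case x.sum(skipna=False) is NaN and the comparison below is False).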

    def normed_or_uniform(x):
        return x / x.sum() if x.sum(skipna=False) > 0 else pd.Series(
            1. / len(x), x.index)

    weighting = generators.weight.groupby(grouper,
                                          axis=0).transform(normed_or_uniform)
    generators['capital_cost'] *= weighting
    strategies = {
        'p_nom_max': np.min,
        'weight': np.sum,
        'p_nom': np.sum,
        'capital_cost': np.sum
    }
    strategies.update(custom_strategies)
    if strategies['p_nom_max'] is np.min:
        generators['p_nom_max'] /= weighting

    strategies.update((attr, _make_consense('Generator', attr))
                      for attr in columns.difference(strategies))
    new_df = generators.groupby(grouper, axis=0).agg(strategies)
    new_df.index = _flatten_multiindex(new_df.index).rename("name")

    new_df = pd.concat([
        new_df, network.generators.loc[~gens_agg_b].assign(
            bus=lambda df: df.bus.map(busmap))
    ],
                       axis=0,
                       sort=False)

    new_pnl = dict()
    if with_time:
        for attr, df in iteritems(network.generators_t):
            pnl_gens_agg_b = df.columns.to_series().map(gens_agg_b)
            df_agg = df.loc[:, pnl_gens_agg_b]
            if not df_agg.empty:
                if attr == 'p_max_pu':
                    df_agg = df_agg.multiply(weighting.loc[df_agg.columns],
                                             axis=1)
                pnl_df = df_agg.groupby(grouper, axis=1).sum()
                pnl_df.columns = _flatten_multiindex(
                    pnl_df.columns).rename("name")
                new_pnl[attr] = pd.concat([df.loc[:, ~pnl_gens_agg_b], pnl_df],
                                          axis=1,
                                          sort=False)

    return new_df, new_pnl
Exemplo n.º 45
0
# df_close = pd.DataFrame()
for fid in range(1, len(filename)):

    # print(fid, filename[fid])
    '''
    _df = pd.read_csv(path + filename[fid], index_col='date', parse_dates=True)
    # to keep the first of the duplicate rows, set keep='first'
    _df = _df.reset_index().drop_duplicates(subset='date', keep='first')
    _df['date'] = _df['date'].dt.date
    _df = _df.set_index('date')
    _df.to_csv(drop_duplicate_path + filename[fid])
    '''
    _df = pd.read_csv(drop_duplicate_path + filename[fid], index_col='date', parse_dates=True)

#     print(_df.index.duplicated().sum())
    df_close = pd.concat([df_close, _df.loc[~_df.index.duplicated(), ['close']].rename(
        columns={'close': (filename[fid].split('_')[0])})], join='outer', axis=1, sort=True)
df_close = df_close.fillna(method='bfill')

df_close = np.log(df_close)

for i in list(range(1650))[::-1]:
    df_close.iloc[i] -= df_close.iloc[i-1]
drop_filenames = os.listdir(drop_duplicate_path)
print(len(drop_filenames))
import pickle

with open('./stock_data/data/relation/ordered_ticker.pkl', 'rb') as f:
    all_stock = pickle.load(f)
df_close = df_close[all_stock]
with open('./stock_data/data/relation/adj_mat.pkl', 'rb') as f:
    all_mat = pickle.load(f)
Exemplo n.º 46
0
    def get_options_data(self, month=None, year=None, expiry=None):
        """
        ***Experimental***
        Gets call/put data for the stock with the expiration data in the
        given month and year

        Parameters
        ----------
        month : number, int, optional(default=None)
            The month the options expire. This should be either 1 or 2
            digits.

        year : number, int, optional(default=None)
            The year the options expire. This should be a 4 digit int.

        expiry : date-like or convertible or list-like object, optional (default=None)
            The date (or dates) when options expire (defaults to current month)

        Returns
        -------
        pandas.DataFrame
            A DataFrame with requested options data.

            Index:
                Strike: Option strike, int
                Expiry: Option expiry, Timestamp
                Type: Call or Put, string
                Symbol: Option symbol as reported on Yahoo, string
            Columns:
                Last: Last option price, float
                Chg: Change from prior day, float
                Bid: Bid price, float
                Ask: Ask price, float
                Vol: Volume traded, int64
                Open_Int: Open interest, int64
                IsNonstandard: True if the deliverable is not 100 shares, otherwise false
                Underlying: Ticker of the underlying security, string
                Underlying_Price: Price of the underlying security, float64
                Quote_Time: Time of the quote, Timestamp

        Notes
        -----
        Note: Format of returned data frame is dependent on Yahoo and may change.

        When called, this function will add instance variables named
        calls and puts. See the following example:

            >>> aapl = Options('aapl', 'yahoo')  # Create object
            >>> aapl.calls  # will give an AttributeError
            >>> aapl.get_options()  # Get data and set ivars
            >>> aapl.calls  # Doesn't throw AttributeError

        Also note that aapl.calls and aapl.puts will always be the calls
        and puts for the next expiry. If the user calls this method with
        a different expiry, the ivar will be named callsYYMMDD or putsYYMMDD,
        where YY, MM and DD are, respectively, two digit representations of
        the year, month and day for the expiry of the options.

        """
        return concat([
            f(month, year, expiry)
            for f in (self.get_put_data, self.get_call_data)
        ]).sortlevel()
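        # Note: DataFrame.sortlevel() was deprecated in pandas 0.20 and later removed;
        # with a current pandas the same result would be obtained by calling
        # .sort_index() on the concatenated frame instead, e.g.
        #     concat([...]).sort_index()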
Exemplo n.º 47
0
    def filter_processing(self, logical_type, filter):
        logger = logging.getLogger('django')
        df = pd.read_csv(self.open_path)

        # "与"的判断逻辑
        if logical_type == "&":
            for f in filter:
                if f['field_type'] == 0:
                    str_expression = "df['" + f['field_name'] + "']" + f[
                        'filter_method'] + f['filter_obj']
                    logger.debug("LogDebug<"
                                 "str_expression : " + str_expression + ">")
                    df = df[eval(str_expression)]

                elif f['field_type'] == 1 and f['filter_method'] == "contains":
                    df = df[df[f['field_name']].str.contains(f['filter_obj'])]
                elif f['field_type'] == 1 and f[
                        'filter_method'] == "notContains":
                    df = df[~df[f['field_name']].str.contains(f['filter_obj'])]
                elif f['field_type'] == 1 and f['filter_method'] == "notNull":
                    df = df[df[f['field_name']].notnull]
                elif f['field_type'] == 1 and f['filter_method'] == "isNull":
                    df = df[df[f['field_name']].isnull]
            path = self.open_path
            df.to_csv(path, index_label=False, index=0)
            logger.debug("LogDebug<" "logical_type : 与>")
        # "或"的判断逻辑
        elif logical_type == "|":
            df_merger = []
            count = 0
            for f in filter:
                if f['field_type'] == 0:
                    str_expression = "df['" + f['field_name'] + "']" + f[
                        'filter_method'] + f['filter_obj']
                    # df_merger[] = df[eval(str_expression)]
                    df_merger.append(df[eval(str_expression)])
                    count += 1
                elif f['field_type'] == 1 and f['filter_method'] == "contains":
                    df_merger.append(df[df[f['field_name']].str.contains(
                        f['filter_obj'])])
                    count += 1
                elif f['field_type'] == 1 and f[
                        'filter_method'] == "notContains":
                    df_merger.append(
                        df[~df[f['field_name']].str.contains(f['filter_obj'])])
                    count += 1
                elif f['field_type'] == 1 and f['filter_method'] == "isNull":
                    df_merger.append(df[df[f['field_name']].notnull])
                    count += 1
                elif f['field_type'] == 1 and f['filter_method'] == "notNull":
                    df_merger.append(df[df[f['field_name']].isnull])
                    count += 1
            # accumulate, then drop the duplicated rows
            i = 0
            dfs = pd.DataFrame(None)
            while i < count:
                dfs = pd.concat(
                    [dfs, df_merger[i]],
                    join='outer',
                    axis=0,
                    ignore_index=True,
                )
                i += 1
            path = self.open_path
            dfs = dfs.drop_duplicates()
            dfs.to_csv(path, index_label=False, index=0)
Exemplo n.º 48
0
# In[6]:


data_train


# In[7]:


dummies_Cabin = pd.get_dummies(data_train['Cabin'], prefix='Cabin')
dummies_Pclass = pd.get_dummies(data_train['Pclass'], prefix='Pclass')
dummies_Sex = pd.get_dummies(data_train['Sex'], prefix='Sex')
dummies_Embarked = pd.get_dummies(data_train['Embarked'], prefix='Embarked')

df = pd.concat([data_train, dummies_Cabin, dummies_Pclass, dummies_Sex, dummies_Embarked], axis=1)
df.drop(['Name', 'Cabin', 'Pclass', 'Sex', 'Embarked', 'Ticket'], axis=1, inplace=True)
df


# In[10]:


import sklearn.preprocessing as preprocessing
scaler = preprocessing.StandardScaler()
scale_param = scaler.fit(df[['Age', 'Fare']])
df['Age_scaled'] = scaler.fit_transform(df[['Age', 'Fare']], scale_param)[:, 0]
df['Fare_scaled'] = scaler.fit_transform(df[['Age', 'Fare']], scale_param)[:, 1]
df

def circos(df = DataFrame([]), label = True, node_color = 'None', column = 1, inter = 25, size = 5, fontsize = 10):
    df1 = df[['GO', 'Entry', 'Term', 'Short_Term']].merge(df, on = 'Entry', how = 'left').drop_duplicates()
    df2 = DataFrame(df[['GO', 'Term', 'Short_Term', 'Entry']].drop_duplicates().groupby(['GO','Term', 'Short_Term']).Entry.count()).reset_index()
    #### >>>>>>>>>>>>>>>>
    #### Extract non-redundant values from the data matrix
    matrix = df1.pivot_table(values='Entry',index=['GO_x', 'Term_x', 'Short_Term_x'],aggfunc=len,columns=['GO_y', 'Term_y', 'Short_Term_y'])
    ###
    df_mat = []
    n = -1
    for i in list(matrix.columns.values):
        n += 1
        new = DataFrame(matrix.iloc[n:len(matrix)][i])
        nn = -1
        for index, row in new.iterrows():
            nn += 1
            df_mat.append([index, i, new.iloc[nn][i]])
        nn = 0
    ###
    df_mat = DataFrame(df_mat, columns = ['go0', 'go1', 'val']).dropna()
    ###
    nodos = []
    for index, row in df_mat.iterrows():
        if row.go0 == row.go1:
            #print(row.go0, row.go1)
            continue
        else:
            #print(row.go0, row.go1)
            nodos.append([row.go0, row.go1, row.val])
    nodos = DataFrame(nodos)
    columnas = {0:'GO', 1:'Term', 2:'Short_Term'}
    nodos = DataFrame([[i[column] for i in nodos[0]], [i[column] for i in nodos[1]], nodos[2]/2]).T
    #### >>>>>>>>>>>>>>>>
    # if a node interacts with more than one other node, drop the redundancy; if it interacts with none,
    # keep the node and its value, so it shows up in the network as an isolated node
    aislado = [i for i in matrix.columns if len(matrix[[i]].dropna()) == 1]
    aislado = [df_mat[df_mat.go0 == i] for i in aislado]
    if len(aislado) > 0:
        aislado = pd.concat(aislado)
        aislado.columns = [0, 1, 2]
        aislado = DataFrame([[i[column] for i in aislado[0]], [i[column] for i in aislado[1]], aislado[2]/2]).T
        nodos = pd.concat([nodos, aislado])
    else:
        pass
    nodos.columns = ['Source','Target','Weight']
    edges = nodos

    order = []
    for index, row in nodos.iterrows():
        order.append(row[0])
        order.append(row[1])
    orden3 = DataFrame(order).drop_duplicates(keep = 'first').reset_index(drop = True)
    orden3.columns = [columnas[column]]
    nodes = pd.merge(orden3, df2, on = [columnas[column]], how = 'left')
    
    def make_graph(nodes, edges):
        g = nx.Graph()

        for i,row in nodes.iterrows():
            keys = row.index.tolist()
            values = row.values
            # The dict contains all attributes
            g.add_node(row[nodes.columns[0]], **dict(zip(keys,values)))

        for i,row in edges.iterrows():
            keys = row.index.tolist()
            values = row.values
            g.add_edge(row['Source'], row['Target'], **dict(zip(keys,values)))
        return g
    
    g = make_graph(nodes, edges)
    for i,row in nodes.iterrows():
        if row['Entry'] >= inter:
            g.add_node(row[nodes.columns[0]], umbral='up')
        if row['Entry'] < inter:
            g.add_node(row[nodes.columns[0]], umbral='down')
    color_nodo = {'Uniques':nodes.columns[0],
              'Umbral':'umbral',
              'None':False}
    c = nxv.CircosPlot(g,
                   node_color= color_nodo[node_color], # nodes.columns[0],
                   node_grouping= color_nodo[node_color],
                   node_labels=label,
                   node_label_layout='rotation',
                   edge_width= 'Weight',
                   #edge_color = 'umbral',
                   figsize=(size,size),
                   fontsize = fontsize)
    return c.draw()
    ####################
Exemplo n.º 50
0
    for target in targets:
        out.append({'itemSet': D['itemSet'], 'target': target})
    return out


dataPos = pd.DataFrame(
    list(map(lambda x: deleteOne(dataPos.loc[x].to_dict()), dataPos.index)))
dataPos['conversion'] = 1

negativeSampling = 5
dataNeg = list(
    map(lambda x: addOne(data.loc[x].to_dict(), negativeSampling), data.index))
dataNeg = pd.DataFrame(list([item for sublist in dataNeg for item in sublist]))
dataNeg['conversion'] = 0

data = pd.concat([dataPos, dataNeg], ignore_index=True)

setName = 'itemSet'
taskName = 'target'
rewardName = 'conversion'
numItems = nItem
numTasks = numItems
numTraits = 100
lbda = 0.01  # 0.1 -> plateaus at 6
alpha = 0.1  # 0.1 better?
eps = 0.001  # 0.01 -> NA
betaMomentum = 0.0  # 1, drops to 0 after 150 iterations
numIterFixed = 1800
minibatchSize = 5000  # check 10000
maxIter = 2000  #250
gradient_cap = 1000.0
Exemplo n.º 51
0
        result_i.extend(get_ner(i['原发病灶大小'], 'S'))
        result_i.extend(get_ner(i['转移部位'], 'Z'))
        result_i = [j for j in result_i if j[0] is not None]
        # sort
        result_i = sorted(result_i, key=lambda x: len(x[0]))
        result.append(result_i)
        pass
    return result
    pass


if __name__ == '__main__':
    # read the data
    dataone = pd.read_excel('./data/onetrain.xlsx')
    datatwo = pd.read_excel('./data/twotrain.xlsx')
    data = pd.concat((dataone, datatwo), axis=0, ignore_index=True)
    result = get_ners_postion(data)
    result = np.array(result)
    np.save('./data/train.npy', result)

    # # save
    # text = np.array(text)
    # pos = np.array(pos)
    # np.savez('./data/train.npy',text=text,pos=pos)

    # analysis
    # ners = get_ners(data)
    # ners_Y = [j[0] for i in ners for j in i if j[1] == 'Y']
    # ners_Z = [j[0] for i in ners for j in i if j[1] == 'Z']
    # count_y = Counter(ners_Y)
    # count_z = Counter(ners_Z)
Exemplo n.º 52
0
    dictData = {
        'time': times,
        'month': months,
        'sender': senders,
        'recipient': recipients
    }
    dfData = pd.DataFrame(dictData)

#1 Perform Dataframe group by to get the count by Sender & Recipient. Then concat the DataFrame and sort it
dfSender = dfData.groupby('sender').count()[['time']]
dfSender.rename(columns={"time": "cntSender"}, inplace=True)

dfRecipient = dfData.groupby('recipient').count()[['time']]
dfRecipient.rename(columns={"time": "cntRecipient"}, inplace=True)

dfMerged = pd.concat([dfSender, dfRecipient], axis=1, sort=True)
dfMerged.fillna(value={'cntSender': 0, 'cntRecipient': 0}, inplace=True)
dfMerged.sort_values(by=['cntSender', 'cntRecipient'],
                     ascending=False,
                     inplace=True,
                     na_position='last')
dfMerged.to_csv(OUTFILE1)

top = 5
dfHead = dfMerged.head(top)
dfHead.rename(columns={
    "cntSender": "hSender",
    "cntRecipient": "hRecipient"
},
              inplace=True)
lHead = dfHead.index.to_list()
Exemplo n.º 53
0
def roll_up(
    df,
    levels: List[str],
    groupby_vars: List[str],
    extra_groupby_cols: List[str] = None,
    var_name: str = 'type',
    value_name: str = 'value',
    parent_name: str = 'parent',
    agg_func: str = 'sum',
    drop_levels: List[str] = None,
):
    """
    Creates aggregates following a given hierarchy

    ---

    ### Parameters

    *mandatory :*
    - `levels` (*list of str*): name of the columns composing the hierarchy
      (from the top to the bottom level).
    - `groupby_vars` (*list of str*): name of the columns with value to
      aggregate.
    - `extra_groupby_cols` (*list of str*) optional: other columns used to
      group in each level.

    *optional :*
    - `var_name` (*str*) : name of the result variable column.
      By default, `“type”`.
    - `value_name` (*str*): name of the result value column.
      By default, `“value”`.
    - `parent_name` (*str*): name of the result parent column.
      By default, `"parent"`.
    - `agg_func` (*str*): name of the aggregation operation.
      By default, `“sum”`.
    - `drop_levels` (*list of str*): the names of the levels that you may want
      to discard from the output.
    ---

    ### Example

    **Input**

    |    Region |     City |  Population |
    |:---------:|:--------:|:-----------:|
    |       Idf |     Panam|         200 |
    |       Idf |   Antony |          50 |
    |      Nord |    Lille |          20 |

    ```cson
    roll_up:
      levels: ["Region", "City"]
      groupby_vars: "Population"
    ```

    **Output**

    |    Region |     City |  Population |    value |   type |
    |:---------:|:--------:|:-----------:|:--------:|:------:|
    |       Idf |     Panam|         200 |    Panam |   City |
    |       Idf |   Antony |          50 |   Antony |   City |
    |      Nord |    Lille |          20 |    Lille |   City |
    |       Idf |      NaN |         250 |      Idf | Region |
    |      Nord |      NaN |          20 |     Nord | Region |
    """
    dfs = list()
    groupby_cols_cpy = list(levels)
    levels_cpy = list(levels)
    levels_cpy.reverse()

    extra_groupby_cols = extra_groupby_cols or []
    drop_levels = drop_levels or []
    previous_level = None
    for (idx, top_level) in enumerate(levels_cpy):
        # Aggregation
        gb_df = getattr(
            df.groupby(groupby_cols_cpy + extra_groupby_cols)[groupby_vars], agg_func
        )().reset_index()

        # Melt-like columns
        gb_df[var_name] = top_level
        gb_df[value_name] = gb_df[top_level]
        gb_df[parent_name] = gb_df[levels_cpy[idx + 1]] if idx < len(levels_cpy) - 1 else np.NaN
        dfs.append(gb_df)
        if previous_level in drop_levels:
            del dfs[-2]
        previous_level = top_level

        # Remove one level each time in the groupby: lowest level column needs
        # a groupby with every levels, the next level needs every one except
        # the lowest, etc. until the top level column that needs only itself
        # inside the groupby.
        groupby_cols_cpy.pop()
    return pd.concat(dfs, sort=False).reset_index()
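# A small usage sketch of roll_up on the docstring's example data (assuming pandas as pd
# and numpy as np are imported at module level; the DataFrame below is illustrative only):
cities = pd.DataFrame({
    'Region': ['Idf', 'Idf', 'Nord'],
    'City': ['Panam', 'Antony', 'Lille'],
    'Population': [200, 50, 20],
})
rolled = roll_up(cities, levels=['Region', 'City'], groupby_vars=['Population'])
print(rolled[['Region', 'City', 'Population', 'value', 'type', 'parent']])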
def net_plot(df = DataFrame([]), layout = 'Spring', label = 'none', column = 0, label_size = 5,diam_nodos = 10, espe_edges = 0.1,
             inter = 10, color_inter_min = 'k',color_inter_max = 'blue',
             edge_alpha_min = 0.3, edge_alpha_max = 0.3, k_num = 3, color_nodo = 'red', node_alpha = 0.7, backg = 'white',
             label_color = 'black'):
    #
    df1 = df[['GO', 'Entry', 'Term', 'Short_Term']].merge(df, on = 'Entry', how = 'left').drop_duplicates()
    df2 = DataFrame(df[['GO', 'Term', 'Short_Term', 'Entry']].drop_duplicates().groupby(['GO','Term', 'Short_Term']).Entry.count()).reset_index()
    #### >>>>>>>>>>>>>>>>
    #### Extract non-redundant values from the data matrix
    matrix = df1.pivot_table(values='Entry',index=['GO_x', 'Term_x', 'Short_Term_x'],aggfunc=len,columns=['GO_y', 'Term_y', 'Short_Term_y'])
    ###
    df_mat = []
    n = -1
    for i in list(matrix.columns.values):
        n += 1
        new = DataFrame(matrix.iloc[n:len(matrix)][i])
        nn = -1
        for index, row in new.iterrows():
            nn += 1
            df_mat.append([index, i, new.iloc[nn][i]])
        nn = 0
    ###
    df_mat = DataFrame(df_mat, columns = ['go0', 'go1', 'val']).dropna()
    ###
    nodos = []
    for index, row in df_mat.iterrows():
        if row.go0 == row.go1:
            #print(row.go0, row.go1)
            continue
        else:
            #print(row.go0, row.go1)
            nodos.append([row.go0, row.go1, row.val])
    nodos = DataFrame(nodos)
    columnas = {0:'GO', 1:'Term', 2:'Short_Term'}
    nodos = DataFrame([[i[column] for i in nodos[0]], [i[column] for i in nodos[1]], nodos[2]]).T
    #### >>>>>>>>>>>>>>>>
    # if a node interacts with more than one other node, drop the redundancy; if it interacts with none,
    # keep the node and its value, so it shows up in the network as an isolated node
    aislado = [i for i in matrix.columns if len(matrix[[i]].dropna()) == 1]
    aislado = [df_mat[df_mat.go0 == i] for i in aislado]
    if len(aislado) > 0:
        aislado = pd.concat(aislado)
        aislado.columns = [0, 1, 2]
        aislado = DataFrame([[i[column] for i in aislado[0]], [i[column] for i in aislado[1]], aislado[2]]).T
        nodos = pd.concat([nodos, aislado])
    else:
        pass
    ####################
    # https://networkx.github.io/documentation/networkx-2.3/auto_examples/drawing/plot_weighted_graph.html#sphx-glr-auto-examples-drawing-plot-weighted-graph-py
    G=nx.Graph()
    for index, row in nodos.iterrows():
        G.add_edge(row[0], row[1],weight = row[2])
    elarge=[(u,v,d['weight']) for (u,v,d) in G.edges(data=True) if d['weight'] >= inter]
    esmall=[(u,v,d['weight']) for (u,v,d) in G.edges(data=True) if d['weight'] < inter]
    ###
    layouts = {'Circular':nx.circular_layout,
          'Random':nx.random_layout,
          'Shell':nx.shell_layout,
          'Spectral':nx.spectral_layout,
          'Spring':nx.spring_layout,
          'KK':nx.kamada_kawai_layout}
    #circular_layout
    #random_layout
    #shell_layout
    #spring_layout
    #spectral_layout
    #pos=nx.spring_layout(G, k = k_num) # positions for all nodes
    if layouts[layout] == nx.spring_layout:
        pos=layouts[layout](G, k = k_num)
    else:
        pos=layouts[layout](G)
    #pos=layout(G)
    # nodes
    #------------------------------------------------------------------------------
    # sort the values so they can be represented in the node size
    order = []
    for index, row in nodos.iterrows():
        order.append(row[0])
        order.append(row[1])
    orden3 = DataFrame(order).drop_duplicates(keep = 'first').reset_index(drop = True)
    orden3.columns = [columnas[column]]
    orden4 = pd.merge(orden3, df2, on = [columnas[column]], how = 'left')
    #------------------------------------------------------------------------------
    # https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.drawing.nx_pylab.draw_networkx_edges.html
    nx.draw_networkx_nodes(G,pos,node_size= np.array(orden4.Entry) * diam_nodos,
                           node_color= color_nodo,alpha= node_alpha)
    # edges
    nx.draw_networkx_edges(G,pos,edgelist=esmall,
                           width = np.array([i[2] for i in esmall]) * espe_edges,
                           alpha= edge_alpha_min,edge_color= color_inter_min,style='-')
    nx.draw_networkx_edges(G,pos, edgelist=elarge,
                           width = np.array([i[2] for i in elarge]) * espe_edges,
                           alpha= edge_alpha_max,edge_color= color_inter_max,style= '-')

    # labels
    posicion = {} ## label positions, slightly above the nodes
    for key, value in pos.items():
        posicion[key] = value + 0.05
    # array of the node positions in the Cartesian plane
    arr = np.array([[i for i in value] for key, value in pos.items()])
    
    # labels
    if label == 'label':
        nx.draw_networkx_labels(G,posicion,font_size=label_size, font_color=label_color) # ,font_weight='bold'
        if label == 'label':
            plt.axis([arr[:,0].min() - 0.3, arr[:,0].max() + 0.3,
                      arr[:,1].min() - 0.3, arr[:,1].max() + 0.3])
        #plt.axis('off')
        #plt.show() # display
    if label == 'none':
        plt.axis([arr[:,0].min() - 0.2, arr[:,0].max() + 0.2,
                  arr[:,1].min() - 0.2, arr[:,1].max() + 0.2])
        #plt.axis('off')
        #plt.show() # display
    plt.gca().set_facecolor(backg)
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['left'].set_visible(False)
    plt.gca().spines['bottom'].set_visible(False)
    plt.gca().axes.get_xaxis().set_visible(False)
    plt.gca().axes.get_yaxis().set_visible(False)   
filen=filepath+'cv_ovocs_2018_M_Rowlinson.csv'
odf = pd.read_csv(filen, index_col=0)
odf.index = pd.to_datetime(odf.index,format='%d/%m/%Y %H:%M')

cols=list(df) ; ocols = list(odf)
for col in cols:
    try:
        df[col] = df[col].loc[~(df[col] <= 0. )]
    except:
        pass
for col in ocols:
    odf = odf.loc[~(odf[col] <= 0.)]
cols=cols+ocols
hourly=df.resample('H').mean()
ohourly=odf.resample('H').mean()
df=pd.concat([hourly,ohourly], axis=1, sort=False)

cvao = df[cv_spec]['2016']
#cvao = cvao.resample('H').mean()

o3 = np.concatenate(o3,axis=0)
mf = pd.DataFrame(o3[:,0,27,31])
mf.index = cvao.index

m31 = 24*31 ; m30 = 24*30 ; m29 = 24*29 ; m28 = 24*28
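# m31/m30/m29/m28 are the hours in a 31/30/29/28-day month; the slices below cut the
# 2016 hourly series into meteorological seasons (DJF wraps around the year end, hence
# the concat of the last 31 days with the first 60).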
mf_djf = pd.concat([mf[-m31:],mf[:m31+m29]])
mf_mam = mf[m31+m29:m31*3+m30+m29]
mf_jja = mf[m31*3+m30+m29:m31*5+m30*2+m29]
mf_son = mf[m31*5+m30*2+m29:m31*6+m30*4+m29]
MF = [mf_djf, mf_mam, mf_jja, mf_son]
cv_djf = pd.concat([cvao[-m31:],cvao[:m31+m29]])
Exemplo n.º 56
0
    li = []

    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', -1)

    for filename in all_files:
        dfi = pd.read_csv(filename, index_col=None, header=0)
        dfi = dfi.sort_values('mse')
        selectedi = dfi.head(1)

        li.append(dfi)

    df = pd.concat(li, axis=0, ignore_index=True)

    if args.abc:
        dfa = df[df['formulas'].str.contains("_A")]
        dfb = dfa[dfa['formulas'].str.contains("_B")]
        df = dfb[dfb['formulas'].str.contains("_C")]

    df = df.sort_values('mse')

    selected = df.head(args.n)

    previousvalue = float("inf")
    for f in selected[["formulas", "mse"]].values:
        rmse = math.sqrt(f[1])
        if rmse != previousvalue:
            print(f[0], rmse)
Exemplo n.º 57
0
rollnowise = {
    'quiz_question': questions,
    'option1': optiona,
    'option2': optionb,
    'option3': optionc,
    'option4': optiond,
    'correct_option': correctans,
    'positive marks': answers,
    'negative marks': wrongans,
    'response': response,
}
listing = [correctchoice, wrongchoice, unattempted, totalmrks, totalmarks]

legend = {
    'Legend':
    ['correctchoice', 'wrongchoice', 'unattempted', 'marks', 'fullmarks'],
    'Total': listing
}
dataframe = pd.DataFrame(rollnowise)
dataframe2 = pd.DataFrame(legend)
dataframe3 = pd.concat([dataframe, dataframe2], ignore_index=False, axis=1)
filename = 'individual_responses/' + 'q' + str(quizno) + '_' + str(
    ROLLNO) + ".csv"
dataframe3.to_csv(filename)
# c.execute('SELECT * FROM project1_marks')
# conn.commit()
quizfile = {'Roll': [str(ROLLNO)], 'MARKS': [str(totalmrks)]}
dataset = pd.DataFrame(quizfile)
filename = "quiz_wise_responses/" + 'scores_' + 'q' + quizno + '.csv'
dataset.to_csv(filename, mode='a')
conn.close()
dow_jones = read_data('data/djia.csv')
print("Loaded DJIA", len(dow_jones))

s_p = read_data('data/S&P.csv')
print("Loaded S&P", len(s_p))

russell_2000 = read_data('data/Russell2000.csv')
print("Loaded Russell", len(russell_2000))

nasdaq = read_data('data/nasdaq.csv')
print("Loaded NASDAQ", len(nasdaq))

# combine stock indexes into one dataframe
data = pd.concat(
    [dow_jones['Open'], s_p['Open'], russell_2000['Open'], nasdaq['Open']],
    axis=1,
    keys=['dow_jones', 'S&P', 'russell_2000', 'nasdaq'])
'''
# compare indexes
(data / data.ix[0] * 100).plot(figsize=(12,12))
plt.title("Standarized Indexes 1990-2016")
plt.show()
'''

# predict next year's price
dow_jones['Future'] = dow_jones['Open'].shift(-252)

# drop Nan
dow_jones = dow_jones.dropna()

train = dow_jones.loc[dow_jones.index < '12-31-2015']
Exemplo n.º 59
0
        #test_corpus  = corpus[N_TRAIN:]

        # Write the shuffled corpora to file
        f = open('%s/bigramfree_%05i_corpus.txt' % (OUTPUT_DIR, i + 1), 'w')
        corpus = "\n".join([" ".join(w) for w in corpus])
        f.write(corpus)
        f.close()

        #f = open('%s/bigramfree_%05i_test_corpus.txt'%(OUTPUT_DIR,i+1),'w')
        #corpus = "\n".join([ " ".join(w) for w in test_corpus ])
        #f.write(corpus)
        #f.close()

        return corpus_stats
    return None


corpus_stats = pd.DataFrame()

if True:
    pool = mp.Pool(processes=6)
    results = [
        pool.apply_async(generate_bigram_corpus, args=(i, ))
        for i in range(N_CORPORA)
    ]
    output = [p.get() for p in results]

corpus_stats = pd.concat(output)

corpus_stats.to_csv('interim/bigramgen_free_corpus_stats.csv')
    def extract(self, start_date, end_date, ticker_list):
        '''
        Extract historical data.

        Args:
            start_date:     The date range (start).
            end_date:       The date range (end).
            ticker_list:    `list` of the target tickers.
        
        Returns:
            `pd.DataFrame`.
        '''
        df_list = [None]
        for i in range(len(ticker_list)):
            df = pd.read_csv(self.__logs_dir + ticker_list[i] + ".csv")
            df["ticker"] = ticker_list[i]
            df_list.append(df)

        result_df = pd.concat(df_list)
        #self.__logger.debug("total: " + str(result_df.shape[0]))

        result_df = result_df[[
            "adjusted_close",
            "close",
            "high",
            "low",
            "open",
            "volume",
            "timestamp",
            "ticker"
        ]]

        result_df = result_df.dropna()
        #self.__logger.debug("After dropping na: " + str(result_df.shape[0]))

        result_df["date"] = result_df["timestamp"]
        try:
            result_df["timestamp"] = result_df.date.apply(self.__get_timestamp)
        except Exception as e:
            print(e)
            print(result_df["date"].drop_duplicates())
            raise

        if start_date is not None:
            start_timestamp = datetime.strptime(start_date, self.__date_format).timestamp()
            result_df = result_df[result_df.timestamp >= start_timestamp]
        if end_date is not None:
            end_timestamp = datetime.strptime(end_date, self.__date_format).timestamp()
            result_df = result_df[result_df.timestamp <= end_timestamp]

        date_df = result_df.sort_values(by=["timestamp"]).drop_duplicates(["date"])
        date_df = date_df.reset_index()
        date_df = pd.concat([
            date_df,
            pd.DataFrame(np.arange(date_df.shape[0]), columns=["date_key"])
        ], axis=1)
        
        result_df = pd.merge(
            result_df,
            date_df[["date", "date_key"]],
            on="date"
        )

        result_df = result_df.sort_values(by=["timestamp", "ticker"])

        return result_df