Example #1
def squash_segments(seg_pset):
    """Combine contiguous segments."""
    curr_chrom = None
    curr_start = None
    curr_end = None
    curr_genes = []
    curr_val = None
    curr_cnt = 0
    squashed_rows = []
    for row in seg_pset:
        if row.chromosome == curr_chrom and row.log2 == curr_val:
            # Continue the current segment
            curr_end = row.end
            curr_genes.append(row.gene)
            curr_cnt += 1
        else:
            # Segment break
            # Finish the current segment
            if curr_cnt:
                squashed_rows.append((curr_chrom, curr_start, curr_end,
                                      ",".join(pd.unique(curr_genes)),
                                      curr_val, curr_cnt))
            # Start a new segment
            curr_chrom = row.chromosome
            curr_start = row.start
            curr_end = row.end
            curr_genes = []
            curr_val = row.log2
            curr_cnt = 1
    # Remainder
    squashed_rows.append((curr_chrom, curr_start, curr_end,
                          ",".join(pd.unique(curr_genes)),
                          curr_val, curr_cnt))
    return seg_pset.as_rows(squashed_rows)
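# Usage sketch (hypothetical): squash_segments() expects a segment-array
# object that is not shown in this snippet, so Row and ToySegArray below are
# minimal stand-ins providing only the row attributes and the .as_rows()
# method the function actually touches.
import collections

import pandas as pd

Row = collections.namedtuple('Row', ['chromosome', 'start', 'end', 'gene', 'log2'])

class ToySegArray(list):
    def as_rows(self, rows):
        # Mirror the tuple order built inside squash_segments().
        return pd.DataFrame(rows, columns=['chromosome', 'start', 'end',
                                           'gene', 'log2', 'probes'])

segs = ToySegArray([
    Row('chr1', 0, 100, 'GENE_A', -0.5),
    Row('chr1', 100, 200, 'GENE_B', -0.5),  # same chromosome and log2 -> merged
    Row('chr1', 200, 300, 'GENE_C', 0.3),
])
print(squash_segments(segs))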
def calculate_average_all_pair_distance(csv_file, hasConsensus=True):
    """Compute average pair-wise distances from a median_swc output CSV.

    The input CSV (from the median_swc plugin) contains unique pair-wise
    distances. The consensus reconstruction is usually the last input to
    median_swc(), so it does not appear in the 'swc_file_name1' column.
    Returns a DataFrame of the average distances per reconstruction.
    """

    #remove invalid results
    df_out=pd.DataFrame()
    if not os.path.exists(csv_file):
        return df_out

    df_f = pd.read_csv(csv_file)
    if df_f.empty:
        return df_out
    df_in = df_f[df_f['sum_distance'] >0]



    df_out = pd.DataFrame(columns = ['swc_file_name','average_sum_distance','average_structure_difference','average_max_distance'])

    dfg1 = df_in.groupby('swc_file_name1')
    dfg2 = df_in.groupby('swc_file_name2')

    swc_names = pd.unique(df_in['swc_file_name1'])
    swc_names_2 = pd.unique(df_in['swc_file_name2'])
    consensus_file_name = df_in['swc_file_name2'].tail(1).values[0]
    if 'consensus' not in consensus_file_name:
        #print  "missing consensus"
        return df_out


    row = 0
    for swc_name in swc_names:
        a = dfg1.get_group(swc_name)
        a = a[a['swc_file_name2']!=consensus_file_name]


        b = pd.DataFrame(columns = ['swc_file_name1','swc_file_name2','sum_distance','structure_difference','max_distance']) #empty
        if swc_name in swc_names_2:
            b = dfg2.get_group(swc_name)


        num_of_swcs = len(a) +len(b)
        df_out.loc[row,'swc_file_name']= swc_name.split('/')[-1]

        df_out.loc[row,'average_sum_distance'] = (a['sum_distance'].sum() + b['sum_distance'].sum())/ num_of_swcs
        df_out.loc[row,'average_structure_difference'] = (a['structure_difference'].sum() + b['structure_difference'].sum())/ num_of_swcs
        df_out.loc[row,'average_max_distance'] = (a['max_distance'].sum() + b['max_distance'].sum())/ num_of_swcs

        row = row +1


    df_out.loc[row,'swc_file_name']= consensus_file_name.split('/')[-1]
    consensus_group = dfg2.get_group(consensus_file_name)
    df_out.loc[row,'average_sum_distance'] = consensus_group['sum_distance'].sum() / (num_of_swcs+1)
    df_out.loc[row,'average_structure_difference'] = consensus_group['structure_difference'].sum() / (num_of_swcs+1)
    df_out.loc[row,'average_max_distance'] = consensus_group['max_distance'].sum() / (num_of_swcs+1)


    return df_out
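# Usage sketch (hypothetical file names and values): build a tiny CSV in the
# median_swc layout (unique pairs only, consensus file last in the
# 'swc_file_name2' column) and run it through the function above, which is
# assumed to be in scope together with its pandas/os imports.
import pandas as pd

toy = pd.DataFrame({
    'swc_file_name1': ['/tmp/a.swc', '/tmp/a.swc', '/tmp/b.swc'],
    'swc_file_name2': ['/tmp/b.swc', '/tmp/consensus.swc', '/tmp/consensus.swc'],
    'sum_distance': [1.0, 2.0, 3.0],
    'structure_difference': [0.1, 0.2, 0.3],
    'max_distance': [4.0, 5.0, 6.0],
})
toy.to_csv('toy_median_swc.csv', index=False)
print(calculate_average_all_pair_distance('toy_median_swc.csv'))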
Example #3
    def buildTable(self):
        sub_dict = {str(el):np.random.rand(self.origin_call)
            for el in pd.unique(self.df['ORIGIN_CALL'])}
        Data.lookupTable['origin_call'] = sub_dict 

        sub_dict = {str(el):np.random.rand(self.origin_stand)
            for el in pd.unique(self.df['ORIGIN_STAND'])}
        Data.lookupTable['origin_stand'] = sub_dict 

        sub_dict = {str(el):np.random.rand(self.taxi_id)
            for el in pd.unique(self.df['TAXI_ID'])}
        Data.lookupTable['taxi_id'] = sub_dict 

        sub_dict = {str(el):np.random.rand(self.day_type)
            for el in pd.unique(self.df['DAY_TYPE'])}
        Data.lookupTable['day_type'] = sub_dict 

        sub_dict = {str(el):np.random.rand(self.week)
            for el in range(1,54)}
        Data.lookupTable['week_of_year'] = sub_dict 

        sub_dict = {str(el):np.random.rand(self.day)
            for el in range(1,8)}
        Data.lookupTable['day_of_week'] = sub_dict 

        sub_dict = {str(el):np.random.rand(self.qhour)
            for el in range(1,100)}
        Data.lookupTable['qhour_of_day'] = sub_dict 
Example #4
File: platesheets.py Project: mcvmcv/Poo
def addMarkerSheet(book,marker,results,kea,markerdata,markernotes,colours,imagePath,validated):
	'''Adds a sheet with information about a given marker.'''
	sheet										= book.create_sheet()
	sheet.title									= marker
	writeGroupHeading(sheet,1,'Marker',marker,colours)
	notes										= markernotes.getNotesForMarker(marker)
	writeMarkerNotes(sheet,1,notes,colours)
	m											= 2 + len(notes)
	markerImagePath								= os.path.join(imagePath,marker+'.jpg')
	if os.path.isfile(markerImagePath):
		writeMeltCurves(sheet,m+1,1,markerImagePath)
		m										= m + 19
	groups										= markerdata.getGroupsForMarker(marker)
	
	writeGroupHeading(sheet,m-1,'Results summary','',colours)
	identifiers,groupings						= 'All Samples',['All']
	writeSummaryHeadings(sheet,m,identifiers,groups,colours)
	stats										= results.getStatsTable(marker,groupings)
	grouping									= pd.unique(stats[groupings[-1]].ravel())
	grouping.sort()
	stats										= stats.set_index(grouping)
	for c,g in enumerate(grouping):
		try:
			row									= stats.loc[g]
			writeSummary(sheet,m+c*3+2,'all',row,marker,groups,markerdata,colours)
		except:
			pass
	m											= m+c*3+5
	
	identifiers,groupings						= 'Plate',['Plate ID','Plate Name','Plate Label']
	writeSummaryHeadings(sheet,m,identifiers,groups,colours)
	stats										= results.getStatsTable(marker,groupings)
	grouping									= pd.unique(stats[groupings[-1]].ravel())
	grouping.sort()
	stats										= stats.set_index(grouping)
	for c,g in enumerate(grouping):
		try:
			row									= stats.loc[g]
			writeSummary(sheet,m+c*3+2,False,row,marker,groups,markerdata,colours)
		except:
			pass
	m											= m+c*3+5
	
	if kea:
		identifiers,groupings					= 'Population',['Population']
		writeSummaryHeadings(sheet,m,identifiers,groups,colours)
		stats									= results.getStatsTable(marker,groupings)
		grouping								= pd.unique(stats[groupings[-1]].ravel())
		grouping.sort()
		stats									= stats.set_index(grouping)
		for c,g in enumerate(grouping):
			try:
				row								= stats.loc[g]
				writeSummary(sheet,m+c*3+2,'kea',row,marker,groups,markerdata,colours)
			except:
				pass
		m										= m+c*3+5
		
	writeGroupHeading(sheet,m,'Marker Validations','',colours)
	writeValidated(sheet,m+1,marker,markerdata,colours,validated)
Example #5
def label_encode_train_test_sets (train, test) :
	" Label encode 'supplier' and 'bracket_pricing' features for both train and test set "
	test_suppliers = np.sort(pd.unique(test.supplier.ravel()))
	print ("Test suppliers shape & elements: ", test_suppliers.shape, test_suppliers)
	train_suppliers = np.sort(pd.unique(train.supplier.ravel()))
	print ("Train suppliers shape & elements: ", train_suppliers.shape, train_suppliers)
	
	## Merge 'supplier' for both datasets first because we want encoding to be consistent across both
	# http://docs.scipy.org/doc/numpy/reference/generated/numpy.sort.html
	supplier_ids = []
	supplier_ids.extend(train_suppliers)
	supplier_ids.extend(test_suppliers)
	supplier_ids = np.sort(np.unique(supplier_ids))
	print ("Merged supplier_ids.shape: ", supplier_ids.shape)
	# print ("supplier_ids.elements: ", supplier_ids)

	## Perform label encoding fit on the merged array and then individually transform for train and test sets
	print ("Performing label encoding on supplier column...")
	label_e = LabelEncoder()
	label_e.fit(supplier_ids)
	train['supplier'] = label_e.transform(train['supplier'])
	test['supplier'] = label_e.transform(test['supplier'])

	## Perform label encoding on 'bracket_pricing'
	print ("Performing label encoding on bracket_pricing column...")
	train['bracket_pricing'] = label_e.fit_transform(train['bracket_pricing'])
	test['bracket_pricing'] = label_e.fit_transform(test['bracket_pricing'])

	return train, test
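# Usage sketch with made-up supplier IDs: the function above is assumed to be
# in scope along with numpy and sklearn's LabelEncoder. Fitting on the merged
# supplier list keeps the encoding consistent across train and test.
import pandas as pd

train = pd.DataFrame({'supplier': ['S-1', 'S-2', 'S-1'],
                      'bracket_pricing': ['Yes', 'No', 'Yes']})
test = pd.DataFrame({'supplier': ['S-2', 'S-3'],
                     'bracket_pricing': ['No', 'Yes']})
train, test = label_encode_train_test_sets(train, test)
print(train)
print(test)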
def create_table_POP(mimic_db):
	# Limit the population
	POP = mimic_db['ICUSTAY_DETAIL']
	# Criterion 1: first ICU stay, adult patient, ICU length of stay between 12 and 96 hours
	POP = POP[(POP['ICUSTAY_SEQ'] == 1) & (POP['ICUSTAY_AGE_GROUP'] == 'adult') & (POP['ICUSTAY_LOS'] >= 12*60) & (POP['ICUSTAY_LOS'] <= 96*60)]
	# Criterion 2: 1) Exclude CMO, 2) Exclude DNR/DNI, 3) Include only Full Code, 4) No NSICU, CSICU 
	# Merge the patient data with chartevents
	MERGED = POP.merge(mimic_db['CHARTEVENTS'], on='ICUSTAY_ID', how='left')
	# Find PACEMAKER data, Find RISK FOR FALLS data
	PACEMAKER = MERGED[MERGED['ITEMID'] == 1484][['ICUSTAY_ID', 'VALUE1']]
	RISKFALLS = MERGED[MERGED['ITEMID'] == 516][['ICUSTAY_ID', 'VALUE1']]
	PACEMAKER = PACEMAKER.groupby('ICUSTAY_ID', as_index=False).agg(lambda x: x.iloc[0])
	RISKFALLS = RISKFALLS.groupby('ICUSTAY_ID', as_index=False).agg(lambda x: x.iloc[0])
	PACEMAKER.rename(columns={'VALUE1': 'PACEMAKER'}, inplace=True)
	RISKFALLS.rename(columns={'VALUE1': 'RISKFALLS'}, inplace=True)

	all_ICUSTAY_ID = pd.unique(MERGED['ICUSTAY_ID'].values.ravel())
	# Grab out only the events WITHOUT full code for care protocol
	MERGED = MERGED[(MERGED['ITEMID'] == 128) & (MERGED['VALUE1'] != 'Full Code')]
	bad_ICUSTAY_ID = pd.unique(MERGED['ICUSTAY_ID'].values.ravel())
	# Subtract the two sets 
	good_ICUSTAY_ID = np.array([i for i in all_ICUSTAY_ID if i not in bad_ICUSTAY_ID])
	POP = POP[POP['ICUSTAY_ID'].isin(good_ICUSTAY_ID)]
	# Remove any NSICU Service or CSICU Service patients
	POP = POP[~POP['ICUSTAY_FIRST_SERVICE'].isin(['NSICU', 'CSICU'])]
	
	# Merge with the selection data
	POP = POP.merge(PACEMAKER, on='ICUSTAY_ID', how='left')
	POP['PACEMAKER'].fillna('No', inplace=True)
	POP = POP.merge(RISKFALLS, on='ICUSTAY_ID', how='left')
	POP['RISKFALLS'].fillna('None', inplace=True)
	return POP 
Example #7
def processVolumeData(aggregated):
	df_2013 = data_utils.parseFileWithIndex('data/2013/Medicare Volume Measures.csv', 
                               ['Diagnosis Related Group', 'Number Of Cases'])
	df_2012 = data_utils.parseFileWithIndex('data/2012/Medicare Payment and Volume Measures.csv', 
                               ['Diagnosis Related Group', 'Number Of Cases'])
    
	mincases = '10'
	missing_marker = '*'
	test_column = 'Chest Pain 2013'
	
	reformatted = []
	for df in [df_2013, df_2012]:
		df.loc[df['Number Of Cases'] == missing_marker, 'Number Of Cases'] = mincases
		df['Number Of Cases'] = df['Number Of Cases'].str.replace(",", "")
		df['Number Of Cases'] = df['Number Of Cases'].astype(float)
    
		hospitals = pd.unique(df.index)
		cols = pd.unique(df['Diagnosis Related Group'])
    
		df2 = pd.DataFrame(data = 0, index = hospitals, columns = cols)
		for col in cols:
			x = df['Number Of Cases'][df['Diagnosis Related Group'] == col]
			df2[col] = x
		reformatted.append(df2)
		assert 'Number of Cases' not in df2.columns
		assert 'Diagnosis Related Group' not in df2.columns
	
	reformatted[0].columns = reformatted[0].columns.map(lambda x: str(x) + ' 2013')
	volume = reformatted[0].join(reformatted[1], how = 'outer', rsuffix=' 2012')
	assert test_column in volume.columns
	volume[pd.isnull(volume)] = float(mincases) 
	merged_final_data = aggregated.join(volume, how='left')
	merged_final_data = merged_final_data.fillna(float(mincases))
	return merged_final_data
Example #8
def get_nutrient_profiles(df):
    '''
    Function to parse the depth-nutrient concentrations from a pandas.core.frame.DataFrame object ('df').
    The df should be the tabular nutrient file imported from the nutrients data file.
    The data will be sorted into an OrderedDict structure with the following key-hierarchy:

        Stations (region specific, i.e. Calvert)
            Nutrients (SiO2, NO2+NO3, PO4)
                Dates Sampled
                    Nutrient concentration (with the sampling depth arranged as the indices)

    The end key-value will be the nutrient concentrations of the respective nutrients sampled 
    (structured as a pandas.core.series.Series object) with the sampling depth as the Series indices.
    '''
    stations_sampled = np.sort(pd.unique(df['Site ID']))
    profiles = OrderedDict()
    nutrients_sampled = ['PO4', 'SiO2', 'NO2+NO3']
    for each_sta in stations_sampled:
        profiles[each_sta] = {}
        for each_nutrient in nutrients_sampled:
            profiles[each_sta][each_nutrient] = {}
            for each_date in pd.unique(df.loc[df['Site ID'] == each_sta, 'Date']):
                profiles[each_sta][each_nutrient][each_date] = df.loc[(df['Site ID'] == each_sta) & (df['Date'] == each_date), each_nutrient]
                profiles[each_sta][each_nutrient][each_date].index = df.loc[(df['Site ID'] == each_sta) & (df['Date'] == each_date), 'Depth']
    return profiles
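# Usage sketch (hypothetical station names and values), assuming the function
# above is in scope with its numpy/OrderedDict imports. The nested dict is
# indexed as profiles[station][nutrient][date] -> depth-indexed Series.
import pandas as pd

toy = pd.DataFrame({
    'Site ID': ['QU39', 'QU39', 'KC10'],
    'Date':    ['2017-01-05', '2017-01-05', '2017-01-06'],
    'Depth':   [0, 5, 0],
    'PO4':     [0.8, 0.9, 1.1],
    'SiO2':    [20.1, 21.5, 30.2],
    'NO2+NO3': [5.0, 5.2, 7.9],
})
profiles = get_nutrient_profiles(toy)
print(profiles['QU39']['PO4']['2017-01-05'])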
Example #9
def for_seating(case):
    newframe=pd.DataFrame()                ##  the rearrange of the original data
    subtest=df[df.casenum==case].reset_index(drop=True)  ## 'subtest' only takes the records that have this specific case id
    num=subtest.shape[0]                                 ## num is usually 3, because there are typically 3 records for each case
    j1=(pd.unique((subtest.codej1).dropna()))[0]
    j2=(pd.unique((subtest.codej2).dropna()))[0]
    j3=(pd.unique((subtest.codej3).dropna()))[0]
    for j in range(num):
        copytest=deepcopy(subtest.ix[j])
        if copytest.ids==j1:

            newframe=newframe.append(copytest)

        if copytest.ids==j2:
            copytest.codej2=j1
            copytest.j2vote1=copytest.direct1
            copytest.j2maj1=1
            newframe=newframe.append(copytest)

        if copytest.ids==j3:
            copytest.codej3=j1
            copytest.j3vote1=copytest.direct1
            copytest.j3maj1=1
            newframe=newframe.append(copytest)   
    return newframe
Example #10
def find_best_pars(df):
    """
    Finds the 'best-fit' parameters for each original file and method
    :param df:
    :return:
    """
    # First, get the maximum value of the ccf
    df['max_ccf'] = df['ccf'].map(np.max)

    methods = pd.unique(df.method)
    original_files = pd.unique(df.original)
    best_info = defaultdict(list)
    for original_filename in original_files:
        for method in methods:
            good = df.loc[(df.method == method) & (df.original == original_filename)]
            best = good.loc[good['max_ccf'] == good['max_ccf'].max()]
            # print 'File: {}\n\tmethod = {}\n\tT = {}\n\tlogg = {}\n\t[Fe/H] = {}'.format(original_filename,
            #                                                                             method,
            #                                                                             best['T'].item(),
            #                                                                             best['logg'].item(),
            #                                                                             best['metallicity'].item())
            #print '\tvsini = {}'.format(best['vsini'].item())
            best_info['original'].append(original_filename)
            best_info['method'].append(method)
            best_info['T'].append(best['T'].item())
            best_info['logg'].append(best['logg'].item())
            best_info['metallicity'].append(best['metallicity'].item())
            best_info['vsini'].append(best['vsini'].item())

    return pd.DataFrame(data=best_info)
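# Usage sketch (made-up grid values), assuming the function above is in scope
# with numpy and collections.defaultdict imported. Each row carries a CCF
# array; the row with the highest CCF peak per (file, method) wins.
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'original': ['star1.fits', 'star1.fits'],
    'method': ['simple', 'simple'],
    'T': [5000, 5500],
    'logg': [4.0, 4.5],
    'metallicity': [0.0, -0.5],
    'vsini': [5.0, 10.0],
    'ccf': [np.array([0.1, 0.4]), np.array([0.2, 0.8])],
})
print(find_best_pars(toy))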
Example #11
def encodeMut(filepath, output, mut_predictor=None):
    '''Generates a new dataframe with binarized mutation data'''
    df = pd.read_csv(filepath, header = 0)
    
    # Filter rows by SNP column (remove rows where SNP = y)
    df = df.loc[df['SNP'] != 'y', :]
#     # Remove silent mutations
#     df = df.loc[df['Mutation.Description'] != 'Substitution - coding silent']
    # Filter by FATHMM.prediction column
    df = df.loc[df['FATHMM.prediction'] != 'PASSENGER/OTHER', :]
    
    # Filter by VEP predictions (SIFT/Polyphen scores)
    # if mut_predictor is None, does nothing
    if mut_predictor == 'sift':
        df = df.loc[df['SIFT'] != 'tolerated', :]
    elif mut_predictor == 'polyphen':
        df = df.loc[df['PolyPhen'] != 'benign', :]
    # doesn't predict effects of indels!!
    
    # Create new dataframe
    df2 = pd.DataFrame(index=pd.unique(df['cell_line_name']))
    genes = pd.unique(df['Gene.name'])
    for gene in genes:
        df2[gene+'_mut'] = 0
    for index, row in df.iterrows():
        df2.set_value(row['cell_line_name'], row['Gene.name']+'_mut', 1)
    #Save to file
    df2.to_csv(output, index_label='CELL_LINE')
Example #12
    def daily_stats(self):
        """Overall right/wrong percent by date"""
        grouped = (self.data_base["correct"]==True).groupby(self.data_base["date"])
        correct = grouped.mean().reset_index()
        correct["wrong"] = 1 - correct["correct"]
        correct["date"] = pd.to_datetime(correct["date"], dayfirst=True)
        correct = correct.sort_values("date")
        fig, ax = plt.subplots()
        left_limit = (datetime.datetime.strptime(pd.unique(self.data_base["date"])[0], '%d-%m-%Y') -
                      datetime.timedelta(days=1)).date()
        right_limit = (datetime.datetime.strptime(pd.unique(self.data_base["date"])[-1], '%d-%m-%Y')
                       + datetime.timedelta(days=1)).date()
        ax.plot(correct['date'], correct['correct'], marker='.', color='lightseagreen',
                ms=15, lw=2, linestyle='-', label="correct")
        ax.plot(correct['date'], correct['wrong'], color='coral', marker='.',
                ms=15, lw=2, linestyle='-', label="wrong")
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')
        ax.grid(axis="y", zorder=0, color="#9698A1")
        ax.set_xticks((correct['date'].values))
        ax.set_xlim([left_limit, right_limit])
        ax.set_ylim([0., 1.])
        ax.legend(loc='upper right').get_frame().set_alpha(0.3)
        ax.set_title('Daily Stats', fontsize=15)
        ax.set_xticklabels(correct['date'].map(lambda x: x.strftime("%d %b")))
        ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%d \n %b'))
    def __init__(self,target_label,DF,OUT_PATH,FILENAME,TEST_MODE,logger,eval_output_file,argv):
        self.target_label=target_label
        self.DF=DF
        self.reduced_matrix=np.load(OUT_PATH+FILENAME[:FILENAME.index(".")]+'.npy')
        self.blacklisted_estimators=[]
        self.es=None
        self.best_estimator=None
        self.valid_found=False
        self.test_predictions=None
        self.argv=argv
        if not TEST_MODE:
            self.labelled_indexes=np.array(self.DF[self.target_label+'_num_label'][self.DF[self.target_label+'_num_label'].notnull()].index)
            self.num_classes=dict(sorted(pd.unique(self.DF.loc[self.DF[self.target_label+'_num_label'].notnull(),[self.target_label+'_num_label',target_label]].values),key=lambda x:x[1]))

        else:
            self.labelled_indexes=np.arange(0,500)
            self.num_classes=dict(sorted(pd.unique(self.DF[[self.target_label+'_num_label',target_label]].values),key=lambda x:x[1]))

        logger.info(str(len(self.labelled_indexes))+" labelled instances detected as a SEED for modelling.")
        mask = np.ones(len(self.DF[self.target_label+'_num_label']), dtype=bool)
        mask[self.labelled_indexes] = False

        self.unlabelled_indexes=np.copy(self.DF.index)[mask]
        self.logger=logger
        self._eval_output_file=eval_output_file
Example #14
def makehist(series,df,mincount=0,bins=[],title=""):
    rej = df.rejected == 1
    app = df.rejected == 0
    nrej = sum(rej)*1.0
    napp = sum(app)*1.0
    
    series_rej = pd.Series({count: sum(series[rej]==count) for count in pd.unique(series[rej])})
    series_app = pd.Series({count: sum(series[app]==count) for count in pd.unique(series[app])})

    rej_plot = series_rej[series_rej.index>=mincount]/nrej
    app_plot = series_app[series_app.index>=mincount]/napp
    plt.figure()
    if len(bins)>0:
        n1,bin1,_ = plt.hist(np.array(rej_plot.index),bins=bins,weights=np.array(rej_plot),label='rejected')
        n2,bin2,_ = plt.hist(np.array(app_plot.index),bins=bins,weights=np.array(app_plot),label='approved')
    else:
        n1,bin1,_ = plt.hist(np.array(rej_plot.index),weights=np.array(rej_plot),label='rejected')
        n2,bin2,_ = plt.hist(np.array(app_plot.index),weights=np.array(app_plot),label='approved')
    plt.legend()
    if mincount > 0:
        title = title + "(count >= " + str(mincount) + ")"
    plt.title(title)
    plt.show()
    
    df_freq = pd.concat([
                    pd.DataFrame(series_rej/nrej,columns=['rejected']),
                    pd.DataFrame(series_app/napp,columns=['approved'])],axis=1)
    return df_freq
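# Usage sketch with toy counts: 'toy_series' holds a per-application count and
# 'toy_df.rejected' flags rejected applications; both are assumptions about
# the original data, and makehist() plus matplotlib are assumed in scope.
import pandas as pd

toy_df = pd.DataFrame({'rejected': [1, 1, 0, 0, 0]})
toy_series = pd.Series([2, 3, 2, 2, 5])
freq = makehist(toy_series, toy_df, title="toy counts")
print(freq)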
Example #15
def app_activity_features():
	train = pd.read_csv("gender_age_train.csv")
	test = pd.read_csv("gender_age_test.csv")
	train.drop(['gender','age','group'],axis=1,inplace=True)
	data = train.append(test)

	""" Merge with brand_model table"""
	device_table = pd.read_csv("phone_brand_device_model.csv")
	data = pd.merge(data,device_table,how='left',on='device_id')
	data = data.drop_duplicates()  #drop duplicates  #note: there is still one device associated with 2 brands/models
	del device_table
	print "data build"
	"""
	Create dataframe indicating for each device id, which app is present, and how much is it active
		- merge events and app_events on event_id
		- group by device_id and app_id, and take the mean of activity
	"""
	events = pd.read_csv("events.csv")
	events = events[events['device_id'].isin(list(data['device_id']))]
	apps = pd.read_csv("app_events.csv")
	apps = pd.merge(apps[['event_id','app_id','is_active']],events[['event_id','device_id']],on='event_id')
	apps = apps.groupby(['device_id','app_id'],as_index=False)['is_active'].mean()
	del events
	print "events build"
	"""Reshape the dataframe so that each app is a new feature"""
	reshaped = pd.DataFrame(columns=list(pd.unique(apps['app_id'])),index=list(pd.unique(apps['device_id'])))
	reshaped[list(pd.unique(apps['app_id']))]=0

	for app in list(pd.unique(apps['app_id'])):
		sliced = apps[apps['app_id']==app]
		reshaped[app].loc[list(sliced['device_id'])]=sliced['is_active'].values
	del apps
	return reshaped
Example #16
def prepData():
    
    # load up files from disk
    training_data, kaggle_data = LoadData.load_data()    
    features_in = ['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']
    

    
    # break dates into month, day, year, day of week, hour 
    # categorize category, month, day, year, dow, hour, district
    # scale lat (y), long(x)
    training_data['Year'] = (pd.DatetimeIndex(training_data['Dates']).year) 
    training_data['Month'] = (pd.DatetimeIndex(training_data['Dates']).month)
    training_data['Day'] = (pd.DatetimeIndex(training_data['Dates']).day)
    training_data['Hour'] = (pd.DatetimeIndex(training_data['Dates']).hour)
    training_data['Minute'] = (pd.DatetimeIndex(training_data['Dates']).minute)

  
    


    # cast date as unix time
    training_data['UnixTime'] = (pd.DatetimeIndex(training_data['Dates'])).astype(np.int64) / 10000000000

   
    # day of week to number
    sorted_days = ('Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday')
    def dayOfWeekNumber(d):
        return sorted_days.index(d)
    training_data['DayNumber'] = training_data['DayOfWeek'].apply(dayOfWeekNumber)
    
    
    # set up an id number for each category from alphabetical list
    # add to training_data
    categories = pd.unique(training_data['Category'])
    sorted_categories = (np.sort(categories)).tolist()

    def categoryNumber(category):
        return sorted_categories.index(category)
    training_data['CategoryNumber'] = training_data['Category'].apply(categoryNumber)
    
   
    
    districts = pd.unique(training_data['PdDistrict'])
    sorted_districts = (np.sort(districts)).tolist()
    
    def districtNumber(district):
        return sorted_districts.index(district)
    training_data['DistrictNumber'] = training_data['PdDistrict'].apply(districtNumber)
    
    
    # X is longitude, Y is latitude set ones outside city to median values
    training_data.loc[training_data.X > -122.0, 'X'] = training_data.X.median()
    training_data.loc[training_data.X < -123.0, 'X'] = training_data.X.median()
    training_data.loc[training_data.Y < 37.0, 'Y'] = training_data.Y.median()
    training_data.loc[training_data.Y > 38.0, 'Y'] = training_data.Y.median()

    
    return (training_data)
Example #17
def understand_data(input_list):
    for choice in input_list:
        if choice == 1:
            print raw_data.head(10)
            # print pd.unique(raw_data.TripType)
            print "Count of unique product Upc", int(pd.DataFrame(pd.unique(raw_data.Upc)).count())
            print "Count of unique product department descriptions", int(
                pd.DataFrame(pd.unique(raw_data.DepartmentDescription)).count()
            )
        elif choice == 2:
            share_of_trip_type = pd.DataFrame(
                raw_data.groupby(["TripType"], axis=0)["VisitNumber"].count() * 100 / len(raw_data)
            )
            print share_of_trip_type

            products_departments = pd.DataFrame(
                raw_data.groupby(["DepartmentDescription"], axis=0)["Upc"].nunique()
            )  # http://stackoverflow.com/questions/15411158/pandas-countdistinct-equivalent

            print products_departments
        elif choice == 3:
            # http://pandas.pydata.org/pandas-docs/stable/reshaping.html
            # department_triptype_pivot = pd.pivot_table(raw_data, values='VisitNumber', index='DepartmentDescription', columns='TripType', aggfunc=np.size)
            # print department_triptype_pivot

            department_finelinenum_pivot = pd.pivot_table(
                raw_data, values="VisitNumber", index="FinelineNumber", columns="DepartmentDescription", aggfunc=np.size
            )
            print department_finelinenum_pivot

            department_weekday_pivot = pd.pivot_table(
                raw_data, values="VisitNumber", index="DepartmentDescription", columns="Weekday", aggfunc=np.size
            )
            print department_weekday_pivot

            Weekday_trip_type_pivot = pd.pivot_table(
                raw_data, values="VisitNumber", index="TripType", columns="Weekday", aggfunc=np.size
            )
            print Weekday_trip_type_pivot

        elif choice == 10:
            # http://stackoverflow.com/questions/21654635/scatter-plots-in-pandas-pyplot-how-to-plot-by-category
            groups = raw_data.groupby("TripType")
            fig, ax = plt.subplots()
            for name, group in groups:
                # print name
                # print group.DepartmentDescription
                ax.plot(group.ScanCount, group.Weekday_num, marker="o", linestyle="", ms=5, label=name)
                ax.legend()
            plt.show()

        elif choice == 20:
            # (pd.DataFrame(pd.unique(raw_data.TripType))).to_csv('Unique_trip_types.csv',sep = ',',index = False)
            # share_of_trip_type.to_csv('TripType_percentage_share.csv',sep = ',')
            # products_departments.to_csv('Unique products per department.csv',sep = ',')
            # department_triptype_pivot.to_csv('DepartmentDescription trip Type visit number frequency pivot table.csv',sep=',')
            # department_finelinenum_pivot.to_csv('DepartmentDescription finelinenumber visit number frequency pivot table.csv',sep=',')
            # department_weekday_pivot.to_csv('DepartmentDescription Weekday visit number frequency pivot table.csv',sep=',')
            Weekday_trip_type_pivot.to_csv("TripType Weekday visit number frequency pivot table.csv", sep=",")
def get_rand_index(M, n):
    subset = [(randint(0,M.shape[0]-1), randint(0,M.shape[1]-1)) for _ in range(n)]
    subset = pd.unique(subset)
    while len(subset) < n:
        new_indices = [(randint(0,M.shape[0]-1), randint(0,M.shape[1]-1)) for _ in range(n-len(subset))]
        subset = list(subset) + new_indices
        subset = pd.unique(subset)
    return list(subset)
def schedule_to_timeslot(schedule, n_timeslot=15):
    """
    Create personal schedule from list of schedule
    """
    schedule_df = pd.DataFrame(schedule, columns=['person', 'person_to_meet'])
    person_to_meet_df = pd.DataFrame(schedule_df.person_to_meet.values.tolist(), 
                                    columns=range(1, n_timeslot))
    # schedule to dataframe
    schedule_df = pd.concat((schedule_df[['person']], person_to_meet_df), axis=1)

    # create person list and map to row/ column
    person_list = pd.unique(list(schedule_df['person']))
    P_map = {v: k for k, v in enumerate(person_list)}


    timeslot_list = []
    for i in range(1, n_timeslot):
        timeslot_df = schedule_df[['person', i]].dropna().astype(int).reset_index(drop=True)
        P = np.zeros((len(person_list), len(person_list)), dtype=int)
        
        # adding table number
        count = 1
        for _, r in schedule_df.iterrows():
            if not pd.isnull(r['person']) and not pd.isnull(r[i]) and P[P_map[r['person']], P_map[r[i]]] == 0 and P[P_map[r[i]], P_map[r['person']]] == 0:
                P[P_map[r['person']], P_map[r[i]]] = count
                P[P_map[r[i]], P_map[r['person']]] = count
                count += 1
    
        # fill in pair of people (add random pair of people)
        left_person = list(set(person_list) - set(pd.unique(list(timeslot_df.person) + list(timeslot_df[i].dropna().astype(int)))))
        random.shuffle(left_person)

        random_pair = list(zip(left_person[0:int(len(left_person)/2)], left_person[int(len(left_person)/2)::]))
        for p1, p2 in random_pair:
            count += 1
            P[P_map[p1], P_map[p2]] = count
            P[P_map[p2], P_map[p1]] = count
            
        additional_pair = \
            [[p1, p2, int(P[P_map[p1], P_map[p2]])] for p1, p2 in random_pair] + \
            [[p2, p1, int(P[P_map[p1], P_map[p2]])] for p1, p2 in random_pair]
        left_person_df = pd.DataFrame(additional_pair, columns=['person', i, 'table_number'])
        
        # concatenate
        table_number = [int(P[P_map[r['person']], P_map[r[i]]]) for _, r in timeslot_df.iterrows()]
        timeslot_df['table_number'] = table_number
        timeslot_df = pd.concat((timeslot_df, left_person_df))
        timeslot_list.append(timeslot_df)

    # for all person, make schedule
    person_schedule_all = []
    for p in person_list:
        person_schedule = []
        for t_df in timeslot_list:
            person_schedule.append(t_df[t_df.person == p])
        person_schedule_all.append(pd.concat(person_schedule))
    
    return person_schedule_all # list of dataframe each contains schedule
def getASstats():
    print('.... Statistics ....')
    df = pd.read_csv(INPUT_FILE_PATH, parse_dates=[1])
    unique_src_as = len(pd.unique(df.src_ASN.ravel()))
    unique_dst_as = len(pd.unique(df.dst_ASN.ravel()))
    bytes_count = df.Bytes.sum()
    print('''No. of unique:\nSrc ASes: %d,
Dst ASes: %d,
Total Bytes: %d''' % (unique_src_as, unique_dst_as, bytes_count))
def set_unique_tag_values(df):
    unique_tag = set(pd.unique(df['tag1']))
    unique_tag = unique_tag.union(pd.unique(df['tag2']))
    unique_tag = unique_tag.union(pd.unique(df['tag3']))
    unique_tag = unique_tag.union(pd.unique(df['tag4']))
    unique_tag = unique_tag.union(pd.unique(df['tag5']))
    unique_tag = [x for x in unique_tag if str(x) != 'nan']
    global all_unique_tags
    all_unique_tags = all_unique_tags.union(unique_tag)
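# Usage sketch, assuming this lives in the same module as the function above,
# which expects a module-level set named all_unique_tags (plus pandas/numpy
# imports) to already exist.
import numpy as np
import pandas as pd

all_unique_tags = set()

toy = pd.DataFrame({'tag1': ['python', 'pandas'],
                    'tag2': ['numpy', np.nan],
                    'tag3': [np.nan, np.nan],
                    'tag4': ['python', 'csv'],
                    'tag5': [np.nan, 'io']})
set_unique_tag_values(toy)
print(all_unique_tags)  # {'python', 'pandas', 'numpy', 'csv', 'io'} in some order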
Example #22
def assign_id(data):
    
    items = pd.unique(data['itemid'])
    vks = pd.unique(data['vk'])
    
    itemid = {items[i]:i for i in range(items.shape[0])}
    vkid = {vks[i]:i for i in range(vks.shape[0])}

    return itemid, vkid
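# Usage sketch with made-up ids: pd.unique() preserves order of first
# appearance, so each item/vk gets the index of its first occurrence.
import pandas as pd

toy = pd.DataFrame({'itemid': [10, 20, 10, 30],
                    'vk': ['u1', 'u2', 'u1', 'u3']})
itemid, vkid = assign_id(toy)
print(itemid)  # 10 -> 0, 20 -> 1, 30 -> 2
print(vkid)    # 'u1' -> 0, 'u2' -> 1, 'u3' -> 2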
Example #23
def match_workers_assignments(worker_list, worker_result_df):
    """
    Creates a dataframe with results only from specified workers.
    :param worker_list: workers to filter on
    :param worker_result_df: all worker results
    :return: results filtered by worker
    """
    match_df = worker_result_df[worker_result_df['worker_id'].isin(worker_list)]
    return pd.unique(match_df['assignment_id']).tolist(), pd.unique(match_df['worker_id']).tolist()
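# Usage sketch with made-up worker/assignment ids, assuming the function and
# its pandas import are in scope.
import pandas as pd

worker_result_df = pd.DataFrame({
    'worker_id': ['w1', 'w2', 'w1', 'w3'],
    'assignment_id': ['a1', 'a2', 'a3', 'a4'],
})
assignment_ids, worker_ids = match_workers_assignments(['w1', 'w3'], worker_result_df)
print(assignment_ids)  # ['a1', 'a3', 'a4']
print(worker_ids)      # ['w1', 'w3']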
Example #24
def _write_report(dframe, groups, sub_id=None, sc_split=False, condensed=True,
                  out_file='report.pdf', dpi=DEFAULT_DPI):
    """ Generates the violin plots of each qctype """
    columns = dframe.columns.ravel()
    headers = []
    for group in groups:
        rem = []
        for head in group:
            if head not in columns:
                rem.append(head)
            else:
                headers.append(head)
        for i in rem:
            group.remove(i)

    report = PdfPages(out_file)
    sessions = sorted(pd.unique(dframe.session_id.ravel()))
    for ssid in sessions:
        sesdf = dframe.copy().loc[dframe['session_id'] == ssid]
        scans = pd.unique(sesdf.run_id.ravel())
        if sc_split:
            for scid in scans:
                subset = sesdf.loc[sesdf['run_id'] == scid]
                if len(subset.index) > 1:
                    if sub_id is None:
                        subtitle = '(session: %s other: %s)' % (ssid, scid)
                    else:
                        subtitle = '(Subject: %s, session: %s, other: %s)' % (sub_id, ssid, scid)
                    if condensed:
                        fig = plot_all(sesdf, groups, subject=sub_id,
                                       title='QC measures ' + subtitle)
                    else:
                        fig = plot_measures(
                            sesdf, headers, subject=sub_id,
                            title='QC measures ' + subtitle)
                    report.savefig(fig, dpi=dpi)
                    fig.clf()
        else:
            if len(sesdf.index) > 1:
                if sub_id is None:
                    subtitle = '(session %s)' % (ssid)
                else:
                    subtitle = '(subject %s, session %s)' % (sub_id, ssid)
                if condensed:
                    fig = plot_all(sesdf, groups, subject=sub_id,
                                   title='QC measures ' + subtitle)
                else:
                    fig = plot_measures(
                        sesdf, headers, subject=sub_id,
                        title='QC measures ' + subtitle)
                report.savefig(fig, dpi=dpi)
                fig.clf()

    report.close()
    plt.close()
    # print 'Written report file %s' % out_file
    return out_file
Example #25
def choose_share_class(dataframe):
    """not working!
    """
    #sets variables for commonly used column names
    rank = 'Rank'
    fee = 'Management Fee'
    ter = 'Annual Report Net Expense Ratio'
    ongoing_charge = 'Annual Report Ongoing Charge'
    income = 'Distribution Status'
    #finds unique fund identifiers in the rank column
    ranks = pd.unique(dataframe[rank])
    #sets the boolean column to 0
    dataframe['Chosen Share Class'] = 0
    #sets a column to record difference from the target MER of .75
    dataframe['Difference'] = 0
    #loops through the fund identifiers
    for r in ranks:
        print 'Rank - ', r
        fund = dataframe[dataframe[rank]==r].copy()
        income_type = pd.DataFrame(pd.unique(dataframe[income]))
    #sorts out funds without accumulating share classes
        if income_type.isin(['Acc']).sum()[0] == 0:
            fund['Chosen Share Class'] = 'No Acc'
            print 'No Acc'
        else:        
            fund_acc = fund[fund[income]=='Acc'].copy()
            for row in np.arange(len(fund_acc)):
                if fund_acc['Management Fee'].iloc[row,]!="":
                    fund_acc['Difference'].iloc[row,] = np.absolute(\
                        fund_acc['Management Fee'].iloc[row,]-.75)
                    print 'Difference - ',fund_acc['Difference'].iloc[row,]
                else:
                    fund_acc['Chosen Share Class'].iloc[row,] = 'No MER'
                    print 'No MER'
            try:
                minimum = fund_acc['Difference'].min()
                print 'Minimum - ',minimum
                fund_acc['Chosen Share Class'][fund_acc['Difference']==minimum] = 1
                print 'Success'
                if len(fund_acc[fund_acc['Chosen Share Class']==1])>1:
                    acc = fund_acc[fund_acc['Chosen Share Class']==1].copy()
                    for row in np.arange(len(acc)):
                        print 'Row - ', row
                        print 'Name - ', acc['Name'].iloc[row,]
                        print 'MER - ', acc[fee].iloc[row,]
                    result = int(raw_input('Pick one!: '))
                    acc['Chosen Share Class'] = 0
                    acc['Chosen Share Class'].iloc[result,]=1
                    fund_acc[fund_acc['Chosen Share Class']==1] = acc
            except:
                result = 'error'
                fund_acc['Chosen Share Class'] = result
                print result
            fund[fund[income]=='Acc'] = fund_acc
        dataframe[dataframe[rank]==r] = fund
    return dataframe
def _write_report(df, groups, sub_id=None, sc_split=False, condensed=True,
                  out_file='report.pdf'):
    columns = df.columns.ravel()
    headers = []
    for g in groups:
        rem = []
        for h in g:
            if h not in columns:
                rem.append(h)
            else:
                headers.append(h)
        for r in rem:
            g.remove(r)

    report = PdfPages(out_file)
    sessions = sorted(pd.unique(df.session.ravel()))
    for ss in sessions:
        sesdf = df.copy().loc[df['session'] == ss]
        scans = pd.unique(sesdf.scan.ravel())
        if sc_split:
            for sc in scans:
                subset = sesdf.loc[sesdf['scan'] == sc]
                if len(subset.index) > 1:
                    if sub_id is None:
                        subtitle = '(%s_%s)' % (ss, sc)
                    else:
                        subtitle = '(subject %s_%s_%s)' % (sub_id, ss, sc)
                    if condensed:
                        fig = plot_all(sesdf, groups, subject=sub_id,
                                       title='QC measures ' + subtitle)
                    else:
                        fig = plot_measures(
                            sesdf, headers, subject=sub_id,
                            title='QC measures ' + subtitle)
                    report.savefig(fig, dpi=300)
                    fig.clf()
        else:
            if len(sesdf.index) > 1:
                if sub_id is None:
                    subtitle = '(%s)' % (ss)
                else:
                    subtitle = '(subject %s_%s)' % (sub_id, ss)
                if condensed:
                    fig = plot_all(sesdf, groups, subject=sub_id,
                                   title='QC measures ' + subtitle)
                else:
                    fig = plot_measures(
                        sesdf, headers, subject=sub_id,
                        title='QC measures ' + subtitle)
                report.savefig(fig, dpi=300)
                fig.clf()

    report.close()
    plt.close()
    # print 'Written report file %s' % out_file
    return out_file
Example #27
    def metric(self, numer, denom, numer_count=False, denom_count=False):
        numer_qty = float(self[numer].sum())
        denom_qty = float(self[denom].sum())

        if numer_count:
            numer_qty = float(len(pd.unique(self[numer])))
        elif denom_count:
            denom_qty = float(len(pd.unique(self[denom])))

        return numer_qty / denom_qty
    def setUp(self):
        import os
        import pandas as pd
        import pkg_resources as p
        from qap.viz.plotting import plot_all
        self.plot_all = plot_all

        anat_spat_csv = \
            p.resource_filename("qap", os.path.join("test_data",
                                                    "qap_anatomical_spatial_5rows.csv"))
        func_spat_csv = \
            p.resource_filename("qap", os.path.join("test_data",
                                                    "qap_functional_spatial_5rows.csv"))
        func_temp_csv = \
            p.resource_filename("qap", os.path.join("test_data",
                                                    "qap_functional_temporal_5rows.csv"))

        self.anat_spat_df = pd.read_csv(anat_spat_csv)
        self.func_spat_df = pd.read_csv(func_spat_csv)
        self.func_temp_df = pd.read_csv(func_temp_csv)

        self.anat_spat_sessions = \
            sorted(pd.unique(self.anat_spat_df.Session.ravel()))
        self.func_spat_sessions = \
            sorted(pd.unique(self.func_spat_df.Session.ravel()))
        self.func_temp_sessions = \
            sorted(pd.unique(self.func_temp_df.Session.ravel()))

        self.anat_spat_groups = [['CNR'],
                                ['Cortical Contrast'],
                                ['EFC'],
                                ['FBER'],
                                ['FWHM', 'FWHM_x', 'FWHM_y', 'FWHM_z'],
                                ['Qi1'],
                                ['SNR']]

        self.func_spat_groups = [['EFC'],
                                ['FBER'],
                                ['FWHM', 'FWHM_x', 'FWHM_y', 'FWHM_z'],
                                ['Ghost_%s' % a for a in ['x', 'y', 'z']],
                                ['SNR']]

        self.func_temp_groups = [['Fraction of Outliers (Mean)',
                                  'Fraction of Outliers (Median)',
                                  'Fraction of Outliers (Std Dev)',
                                  'Fraction of Outliers IQR'],
                                 ['GCOR'],
                                 ['Quality (Mean)', 'Quality (Median)',
                                  'Quality (Std Dev)', 'Quality IQR',
                                  'Quality percent outliers'],
                                 ['RMSD (Mean)', 'RMSD (Median)',
                                  'RMSD (Std Dev)', 'RMSD IQR'],
                                 ['Std. DVARS (Mean)', 'Std. DVARS (Median)',
                                  'Std. DVARS percent outliers',
                                  'Std. DVARs IQR']]
Example #29
def homepage():

    # Local containers.
    selected_entity = []
    se_subset = pd.DataFrame()
    asociated_words = pd.DataFrame()
    sources = pd.DataFrame()
    graph_data = []

    # The lines below are the server-side actions that run each time
    # the user clicks on an entity.

    if request.method == 'POST':
        selected_entity = request.form.get('entidades', None)
        se_subset = df[df.entidad == selected_entity]
        se_subset = se_subset.sort(['dateStamp'])
        last_week_days = se_subset['dateStamp'].iloc[-7]
        last_week_subset = se_subset[se_subset.dateStamp == last_week_days]


        asociated_words = (last_week_subset[['adjetivo', 'valor']]
                           .groupby('adjetivo').sum()
                           .sort('valor', ascending=False))

        urls = list(pd.unique(last_week_subset.link.ravel()))
        titles = list(pd.unique(last_week_subset.titulo.ravel()))
        sources = pd.DataFrame(urls, titles)

        # Build the chart with Pygal.

        custom_style = Style(background='transparent',
                             plot_background='transparent',
                             title_font_size=32)

        graph = pygal.Line(show_legend=False, x_label_rotation=20, width=1500,
                           height=450, explicit_size=True, range=(-1.2, 1.2),
                           background="transparent", foreground="transparent",
                           plot_background="transparent", margin=0,
                           style=custom_style, show_minor_x_labels = False)

        graph.title = "Sentimiento para '"+selected_entity+"'"
        agg = se_subset.groupby('dateStamp').mean()
        m_avg = pd.rolling_mean(agg, 3)
        m_avg = m_avg.fillna(0)
        # graph.add(selected_entity, list(agg['valor']))
        graph.add(selected_entity, list(m_avg['valor']))
        date = pd.DatetimeIndex(agg.index)
        graph.x_labels = map(str, date)
        graph.x_labels_major = map(str, date[0::5])
        graph_data = graph.render_data_uri()

    return render_template('index.html', entities=entities,
                           graph_data=graph_data,
                           asociated_words=asociated_words,
                           se_subset=se_subset, sources=sources)
Example #30
def create_dict_of_team_ids(df):
    both_teams = {}
    for game_id in pd.unique(df['GAME_ID'].values.tolist()):
        df_curr = df[df['GAME_ID'] == game_id]
        curr_teams = pd.unique(df_curr['TEAM_ID'].values)

        if curr_teams.size != 2:
            print "ERROR"
        both_teams[game_id] = curr_teams

    return both_teams
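# Usage sketch with made-up game/team ids (the snippet above is Python 2
# because of its bare print statement; the call itself behaves the same way).
import pandas as pd

toy = pd.DataFrame({'GAME_ID': [1, 1, 1, 2, 2],
                    'TEAM_ID': [100, 100, 200, 100, 300]})
both_teams = create_dict_of_team_ids(toy)
print(both_teams)  # {1: array([100, 200]), 2: array([100, 300])}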
to_date = '2019-09-12'

sites_df = pdsql.mssql.rd_sql(server,
                              database,
                              'TSDataNumericHourlySumm',
                              col_names=['ExtSiteID', 'DatasetTypeID'],
                              where_in={'DatasetTypeID': [38, 15]})

prec_ts_df = pdsql.mssql.rd_sql(server,
                                database,
                                'TSDataNumericHourly',
                                col_names=['ExtSiteID', 'DateTime', 'Value'],
                                where_in={
                                    'DatasetTypeID': [38, 15],
                                    'ExtSiteID':
                                    pd.unique(sites_df.ExtSiteID).tolist(),
                                    'QualityCode': [600]
                                })
prec_ts_df['DateTime'] = pd.to_datetime(prec_ts_df['DateTime'])
prec_ts_df = prec_ts_df.loc[(prec_ts_df.DateTime >= pd.Timestamp(from_date))
                            & (prec_ts_df.DateTime <= pd.Timestamp(to_date))]
prec_ts_df.to_csv(
    r'C:\Active\Projects\MetService_precip_analysis\Data\Stations\station_ts.csv',
    index=False)

#-Get the locations of the sites and write to csv
sites_xy = pdsql.mssql.rd_sql(
    server,
    database,
    'ExternalSite',
    col_names=['ExtSiteID', 'NZTMX', 'NZTMY'],
Example #32
def process_scene(ns_scene, env, nusc, data_path):
    scene_id = int(ns_scene['name'].replace('scene-', ''))
    data = pd.DataFrame(columns=['frame_id',
                                 'type',
                                 'node_id',
                                 'robot',
                                 'x', 'y', 'z',
                                 'length',
                                 'width',
                                 'height',
                                 'heading'])

    sample_token = ns_scene['first_sample_token']
    sample = nusc.get('sample', sample_token)
    frame_id = 0
    while sample['next']:
        annotation_tokens = sample['anns']
        for annotation_token in annotation_tokens:
            annotation = nusc.get('sample_annotation', annotation_token)
            category = annotation['category_name']
            if len(annotation['attribute_tokens']):
                attribute = nusc.get('attribute', annotation['attribute_tokens'][0])['name']
            else:
                continue

            if 'pedestrian' in category and not 'stroller' in category and not 'wheelchair' in category:
                our_category = env.NodeType.PEDESTRIAN
            elif 'vehicle' in category and 'bicycle' not in category and 'motorcycle' not in category and 'parked' not in attribute:
                our_category = env.NodeType.VEHICLE
            else:
                continue

            data_point = pd.Series({'frame_id': frame_id,
                                    'type': our_category,
                                    'node_id': annotation['instance_token'],
                                    'robot': False,
                                    'x': annotation['translation'][0],
                                    'y': annotation['translation'][1],
                                    'z': annotation['translation'][2],
                                    'length': annotation['size'][0],
                                    'width': annotation['size'][1],
                                    'height': annotation['size'][2],
                                    'heading': Quaternion(annotation['rotation']).yaw_pitch_roll[0]})
            data = data.append(data_point, ignore_index=True)

        # Ego Vehicle
        our_category = env.NodeType.VEHICLE
        sample_data = nusc.get('sample_data', sample['data']['CAM_FRONT'])
        annotation = nusc.get('ego_pose', sample_data['ego_pose_token'])
        data_point = pd.Series({'frame_id': frame_id,
                                'type': our_category,
                                'node_id': 'ego',
                                'robot': True,
                                'x': annotation['translation'][0],
                                'y': annotation['translation'][1],
                                'z': annotation['translation'][2],
                                'length': 4,
                                'width': 1.7,
                                'height': 1.5,
                                'heading': Quaternion(annotation['rotation']).yaw_pitch_roll[0],
                                'orientation': None})
        data = data.append(data_point, ignore_index=True)

        sample = nusc.get('sample', sample['next'])
        frame_id += 1

    if len(data.index) == 0:
        return None

    data.sort_values('frame_id', inplace=True)
    max_timesteps = data['frame_id'].max()

    x_min = np.round(data['x'].min() - 50)
    x_max = np.round(data['x'].max() + 50)
    y_min = np.round(data['y'].min() - 50)
    y_max = np.round(data['y'].max() + 50)

    data['x'] = data['x'] - x_min
    data['y'] = data['y'] - y_min

    scene = Scene(timesteps=max_timesteps + 1, dt=dt, name=str(scene_id), aug_func=augment)

    # Generate Maps
    map_name = nusc.get('log', ns_scene['log_token'])['location']
    nusc_map = NuScenesMap(dataroot=data_path, map_name=map_name)

    type_map = dict()
    x_size = x_max - x_min
    y_size = y_max - y_min
    patch_box = (x_min + 0.5 * (x_max - x_min), y_min + 0.5 * (y_max - y_min), y_size, x_size)
    patch_angle = 0  # Default orientation where North is up
    canvas_size = (np.round(3 * y_size).astype(int), np.round(3 * x_size).astype(int))
    homography = np.array([[3., 0., 0.], [0., 3., 0.], [0., 0., 3.]])
    layer_names = ['lane', 'road_segment', 'drivable_area', 'road_divider', 'lane_divider', 'stop_line',
                   'ped_crossing', 'stop_line', 'ped_crossing', 'walkway']
    map_mask = (nusc_map.get_map_mask(patch_box, patch_angle, layer_names, canvas_size) * 255.0).astype(
        np.uint8)
    map_mask = np.swapaxes(map_mask, 1, 2)  # x axis comes first
    # PEDESTRIANS
    map_mask_pedestrian = np.stack((map_mask[9], map_mask[8], np.max(map_mask[:3], axis=0)), axis=0)
    type_map['PEDESTRIAN'] = GeometricMap(data=map_mask_pedestrian, homography=homography, description=', '.join(layer_names))
    # VEHICLES
    map_mask_vehicle = np.stack((np.max(map_mask[:3], axis=0), map_mask[3], map_mask[4]), axis=0)
    type_map['VEHICLE'] = GeometricMap(data=map_mask_vehicle, homography=homography, description=', '.join(layer_names))

    map_mask_plot = np.stack(((np.max(map_mask[:3], axis=0) - (map_mask[3] + 0.5 * map_mask[4]).clip(
        max=255)).clip(min=0).astype(np.uint8), map_mask[8], map_mask[9]), axis=0)
    type_map['VISUALIZATION'] = GeometricMap(data=map_mask_plot, homography=homography, description=', '.join(layer_names))

    scene.map = type_map
    del map_mask
    del map_mask_pedestrian
    del map_mask_vehicle
    del map_mask_plot

    for node_id in pd.unique(data['node_id']):
        node_frequency_multiplier = 1
        node_df = data[data['node_id'] == node_id]

        if node_df['x'].shape[0] < 2:
            continue

        if not np.all(np.diff(node_df['frame_id']) == 1):
            # print('Occlusion')
            continue  # TODO Make better

        node_values = node_df[['x', 'y']].values
        x = node_values[:, 0]
        y = node_values[:, 1]
        heading = node_df['heading'].values
        if node_df.iloc[0]['type'] == env.NodeType.VEHICLE and not node_id == 'ego':
            # Kalman filter Agent
            vx = derivative_of(x, scene.dt)
            vy = derivative_of(y, scene.dt)
            velocity = np.linalg.norm(np.stack((vx, vy), axis=-1), axis=-1)

            filter_veh = NonlinearKinematicBicycle(dt=scene.dt, sMeasurement=1.0)
            P_matrix = None
            for i in range(len(x)):
                if i == 0:  # initalize KF
                    # initial P_matrix
                    P_matrix = np.identity(4)
                elif i < len(x):
                    # assign new est values
                    x[i] = x_vec_est_new[0][0]
                    y[i] = x_vec_est_new[1][0]
                    heading[i] = x_vec_est_new[2][0]
                    velocity[i] = x_vec_est_new[3][0]

                if i < len(x) - 1:  # no action on last data
                    # filtering
                    x_vec_est = np.array([[x[i]],
                                          [y[i]],
                                          [heading[i]],
                                          [velocity[i]]])
                    z_new = np.array([[x[i + 1]],
                                      [y[i + 1]],
                                      [heading[i + 1]],
                                      [velocity[i + 1]]])
                    x_vec_est_new, P_matrix_new = filter_veh.predict_and_update(
                        x_vec_est=x_vec_est,
                        u_vec=np.array([[0.], [0.]]),
                        P_matrix=P_matrix,
                        z_new=z_new
                    )
                    P_matrix = P_matrix_new

            curvature, pl, _ = trajectory_curvature(np.stack((x, y), axis=-1))
            if pl < 1.0:  # vehicle is "not" moving
                x = x[0].repeat(max_timesteps + 1)
                y = y[0].repeat(max_timesteps + 1)
                heading = heading[0].repeat(max_timesteps + 1)
            global total
            global curv_0_2
            global curv_0_1
            total += 1
            if pl > 1.0:
                if curvature > .2:
                    curv_0_2 += 1
                    node_frequency_multiplier = 3*int(np.floor(total/curv_0_2))
                elif curvature > .1:
                    curv_0_1 += 1
                    node_frequency_multiplier = 3*int(np.floor(total/curv_0_1))

        vx = derivative_of(x, scene.dt)
        vy = derivative_of(y, scene.dt)
        ax = derivative_of(vx, scene.dt)
        ay = derivative_of(vy, scene.dt)

        if node_df.iloc[0]['type'] == env.NodeType.VEHICLE:
            v = np.stack((vx, vy), axis=-1)
            v_norm = np.linalg.norm(np.stack((vx, vy), axis=-1), axis=-1, keepdims=True)
            heading_v = np.divide(v, v_norm, out=np.zeros_like(v), where=(v_norm > 1.))
            heading_x = heading_v[:, 0]
            heading_y = heading_v[:, 1]

            data_dict = {('position', 'x'): x,
                         ('position', 'y'): y,
                         ('velocity', 'x'): vx,
                         ('velocity', 'y'): vy,
                         ('velocity', 'norm'): np.linalg.norm(np.stack((vx, vy), axis=-1), axis=-1),
                         ('acceleration', 'x'): ax,
                         ('acceleration', 'y'): ay,
                         ('acceleration', 'norm'): np.linalg.norm(np.stack((ax, ay), axis=-1), axis=-1),
                         ('heading', 'x'): heading_x,
                         ('heading', 'y'): heading_y,
                         ('heading', '°'): heading,
                         ('heading', 'd°'): derivative_of(heading, dt, radian=True)}
            node_data = pd.DataFrame(data_dict, columns=data_columns_vehicle)
        else:
            data_dict = {('position', 'x'): x,
                         ('position', 'y'): y,
                         ('velocity', 'x'): vx,
                         ('velocity', 'y'): vy,
                         ('acceleration', 'x'): ax,
                         ('acceleration', 'y'): ay}
            node_data = pd.DataFrame(data_dict, columns=data_columns_pedestrian)

        node = Node(node_type=node_df.iloc[0]['type'], node_id=node_id, data=node_data, frequency_multiplier=node_frequency_multiplier)
        node.first_timestep = node_df['frame_id'].iloc[0]
        if node_df.iloc[0]['robot'] == True:
            node.is_robot = True
            scene.robot = node

        scene.nodes.append(node)

    return scene
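
# derivative_of() (like trajectory_curvature and NonlinearKinematicBicycle) is a helper
# defined elsewhere in this project and is not shown here. A minimal sketch of a
# finite-difference derivative_of(), assuming only the semantics implied by the call
# sites above (a 1-D series, a timestep, and optional angle handling):
import numpy as np

def derivative_of(x, dt=1.0, radian=False):
    x = np.asarray(x, dtype=float)
    if radian:
        # unwrap angles so jumps across +/-pi do not produce spurious spikes
        x = np.unwrap(x)
    if x.size < 2:
        return np.zeros_like(x)
    # central differences in the interior, one-sided differences at the boundaries
    return np.gradient(x, dt)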
Example #33
0
file = read_args_tank_treading()
video = getInputFile(settings_name="extract_cell_snippets.py", video=file)
print(video)

config = getConfig(video)
config["channel_width_m"] = 0.00019001261833616293

data = getData(video)
getVelocity(data, config)

correctCenter(data, config)

data = data[(data.solidity > 0.96) & (data.irregularity < 1.06)]
data.reset_index(drop=True, inplace=True)

ids = pd.unique(data["cell_id"])

image_reader = CachedImageReader(video)

results = []
for id in tqdm.tqdm(ids):
    d = data[data.cell_id == id]

    crops, shifts, valid = getCroppedImages(image_reader, d)

    if len(crops) <= 1:
        continue

    crops = crops[valid]
    shifts = shifts[valid]
Example #34
0
def main(dirs,
         config_filename,
         map_filename=None,
         summary_filename=None,
         with_aid=True,
         with_target=True,
         phenotype=False,
         id_prefix='CID',
         output_format='.pkl.gz'):
    aids = set()
    targets = set()
    total = 0
    config = pd.read_csv(config_filename)
    summary = []
    sid_cid = None
    if map_filename is not None:
        sid_cid = read_sid_cid_map(map_filename)
    if 'aid' not in config.columns:
        raise ValueError('Configuration file must contain "aid" column.')
    assert len(config) == len(pd.unique(config['aid']))
    for this_dir in dirs:
        for filename in glob.glob(os.path.join(this_dir, '*.json.gz')):

            # get AID from filename so we only have to load relevant assays
            aid = int(os.path.basename(filename).split('.')[0])
            if aid not in config['aid'].values:
                continue

            # get configuration for this AID
            this_config = config[config['aid'] == aid].iloc[0]
            if not with_aid and 'aid' in this_config:
                del this_config['aid']
            if not with_target and 'target' in this_config:
                del this_config['target']

            # get data
            try:
                extractor = PcbaDataExtractor(filename,
                                              this_config,
                                              with_aid=with_aid)
            except NotImplementedError as e:
                warnings.warn(str(e))
                continue
            if phenotype and 'phenotype' not in extractor.config:
                warnings.warn('{} has no phenotype'.format(aid))
                continue
            assert aid == extractor.parser.get_aid()  # sanity check for AID match
            aids.add(aid)
            target = extractor.config.get('target')
            targets.add(target)
            data = extractor.get_data(sid_cid=sid_cid)
            total += len(data)

            # add generic molecule ID column
            if id_prefix == 'CID':
                col = 'cid'
            elif id_prefix == 'SID':
                col = 'sid'
            else:
                raise NotImplementedError(
                    'Unrecognized ID prefix "{}"'.format(id_prefix))
            ids = []
            for i, mol_id in enumerate(data[col]):
                try:
                    ids.append(id_prefix + str(int(mol_id)))
                except (TypeError, ValueError):
                    warnings.warn('No ID for the following row:\n{}'.format(
                        data.loc[i]))
                    ids.append(None)  # can be found with pd.isnull

            # skip this assay if there are no valid IDs
            if np.all(pd.isnull(ids)):
                warnings.warn('No valid IDs for AID {}. Skipping.'.format(aid))
                continue
            data.loc[:, 'mol_id'] = pd.Series(ids, index=data.index)

            # add generic assay ID column
            assay_id = 'PCBA-' + str(aid)
            if with_aid:
                data.loc[:, 'assay_id'] = assay_id

            # save dataframe
            output_filename = '{}{}'.format(assay_id, output_format)  # output_format already includes the leading '.'
            print('{}\t{}\t{}\t{}'.format(aid, target, output_filename,
                                          len(data)))
            write_dataframe(data, output_filename)
            summary.append({
                'aid': aid,
                'target': target,
                'filename': output_filename,
                'size': len(data)
            })

    # make sure we found everything
    missing = set(config['aid']).difference(aids)
    if len(missing):
        warnings.warn('Missed AIDs {}'.format(missing))

    # save a summary
    summary = pd.DataFrame(summary)
    if summary_filename is not None:
        write_dataframe(summary, summary_filename)
    warnings.warn(
        'Found {} assays for {} targets ({} total data points)'.format(
            len(aids), len(targets), total))
def histAnimation(det_file, save_in=None, cam='c010', track_id=None):
    """
    Input:
    movie_path
        If movie path is a movie - load and extract frame - # TODO:
        If movie path is a folder - load images
    det_file : xml,txt or pkl
        panda list with the following column names
            ['frame','track_id', 'xmax', 'xmin', 'ymax', 'ymin']
    save_in
        folder to save output frames / movie -# TODO:

    """

    # Get BBox detection from list
    df = ut.getBBox_from_gt(det_file)
    if track_id is None:
        track_id = pd.unique(df['track_id'])
    # Create folders if they don't exist
    if save_in is not None and not os.path.isdir(save_in):
        os.mkdir(save_in)

    df = df.sort_values(by=['frame'])

    # create trajectory of all track
    df_track = df.groupby('track_id')
    index = np.linspace(0, 255, 31)
    colors = plt.cm.hsv(index / float(max(index)))
    for id, tt in df_track:
        first_frame = True
        print('Track id {}'.format(id))
        if id not in track_id:

            continue

        print('frames:')
        for t in tt.index.tolist():
            # 1st frame -
            ts = tt.loc[t, 'time_stamp']
            f = tt.loc[t, 'frame']
            print(f)
            hist = tt.loc[t, 'histogram']
            #print(hist)
            if first_frame:
                first_frame = False
                plt.ion()
                plt.show()
                fig = plt.figure()
                fig.suptitle('Camera {}, Track Id {}'.format(cam, id),
                             fontsize=16)
                ax = fig.add_subplot(111)

                ax.bar(range(len(hist)), hist, color=colors)
                #ax.bar(index, hist)
                ax.set_xlabel('Hue', fontsize=5)
                ax.set_ylabel('Probability', fontsize=5)
                #ax.set_xticklabels(range(len(hist)),index)
                ax.set_title('time {}'.format(ts))
            else:
                ax.clear()
                ax.bar(range(len(hist)), hist, color=colors)
                #ax.bar(index, hist)
                ax.set_xlabel('Hue', fontsize=5)
                ax.set_ylabel('Probability', fontsize=5)
                #ax.set_xticklabels(range(len(hist)),index)
                ax.set_title('time {}'.format(ts))

                if save_in is not None:

                    fig.savefig(os.path.join(save_in,
                                             'tk{}_f{}.png').format(id, f),
                                dpi=fig.dpi)

    return
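
# A minimal usage sketch (the file and folder names are hypothetical): animate the
# hue histograms of tracks 3 and 7 from camera c010 and save the frames to disk.
histAnimation('detections_c010.pkl', save_in='hist_frames', cam='c010', track_id=[3, 7])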
Example #36
0
y_train = train['target']
train = train.drop(['target'], axis=1)
id_test = test['ID']

df_all = pd.concat((train, test), axis=0, ignore_index=True)
df_all['null_count'] = df_all.isnull().sum(axis=1).tolist()
df_all = df_all.fillna(-1)
df_all_temp = df_all['ID']
df_all = df_all.drop(['ID'], axis=1)
df_data_types = df_all.dtypes[:]  #{'object':0,'int64':0,'float64':0,'datetime64':0}
d_col_drops = []

for i in range(len(df_data_types)):
    if str(df_data_types[i]) == 'object':
        df_u = pd.unique(df_all[str(df_data_types.index[i])].ravel())
        print("Column: ", str(df_data_types.index[i]), " Length: ", len(df_u))
        d = {}
        j = 1000
        for s in df_u:
            d[str(s)] = j
            j += 5
        df_all[str(df_data_types.index[i]) + '_vect_'] = df_all[str(
            df_data_types.index[i])].map(lambda x: d[str(x)])
        d_col_drops.append(str(df_data_types.index[i]))
        if len(df_u) < 150:
            dummies = pd.get_dummies(df_all[str(
                df_data_types.index[i])]).rename(columns=lambda x: str(
                    df_data_types.index[i]) + '_' + str(x))
            df_all_temp = pd.concat([df_all_temp, dummies], axis=1)
Example #37
0
File: toxicity.py Project: dsfca/CDados
    for j in range(len(corr_mtx)):
        if ((corr_mtx.iat[i,j] > 0.75 or corr_mtx.iat[i,j] < -0.75) and i > j):
            if(not(i in corr_mtx and j in corr_mtx) and not(j in corr_mtx)):
                indexes.append(i)
                correlated = correlated + 1           
print(correlated)
print(indexes)

#Remove correlated variables
data = data.drop(data.columns[indexes], axis=1)
print(len(data.columns))

####
y: np.ndarray = data.pop('classification').values
X: np.ndarray = data.values
labels = pd.unique(y)


trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y)


allknn = AllKNN()
nm = NearMiss()
smt = SMOTE()
ada = ADASYN(random_state=42)

lst = [allknn, nm, smt, ada]
gb = GradientBoostingClassifier(n_estimators=50, max_depth=10, learning_rate=0.5)

for samp in lst:
    trnX, trnY = samp.fit_resample(trnX, trnY)
Example #38
0
def robustOpr():

    # make the multiplier of the IQR in the boxplot outlier calculation easily
    # accessible for hyperparameter tuning
    # for the 2020 data, 1.6 is optimal rather than the default 1.5
    IQR_MULT = 1.6

    # get the event key from the command-line argument
    try:
        event = sys.argv[1]
    except IndexError:
        sys.exit(
            'ERROR - please specify an event key e.g. 2020scmb as the first argument'
        )

    # optionally get the TBA API read key from the second argument
    if len(sys.argv) >= 3:
        tbaKey = sys.argv[2]
    else:
        try:
            # check for tba_key.json and read the TBA API key from the file
            with open('tba_key.json', 'r') as read_file:
                f = json.load(read_file)
                tbaKey = f['tba_key']
        except OSError:
            sys.exit(
                'ERROR - must provide TBA API read key in a tba_key.json file or as the second argument'
            )

    # get data for a given event key from TheBlueAlliance API
    matches = getRawMatchData(event, tbaKey)

    # convert the match data to match-alliance data as needed by the OPR functions
    maData = matchToAlliance(matches)

    # truncate data to only qualification matches for OPR
    maData = maData[maData['comp_level'] == 'qm']

    # get unique list of teams within the match data as needed for OPR calcs
    teams = pd.unique(maData[['team1', 'team2',
                              'team3']].values.ravel('K')).tolist()
    # sort the team numbers for easier perusing of the OPR output
    # note that the sort is alphabetical since the team numbers are strings
    teams.sort()

    # initialize an index counter for an outlier removal loop
    i = 1

    # loop over OPR calculations - removing outlier match-alliance records in each
    # iteration and recomputing robust OPR - stop when no match-alliance records
    # are identified as outliers
    while True:

        # compute OPR - maData may be the original data for the full event or
        # could be on a truncated set of match-alliances after outlier removal
        # assuming that outlier removal does not remove all matches for any team
        # that the teams list needs to be recomputed during iteration
        opr = calcOPR(teams, maData)

        if i == 1:
            # save the OPR before outlier removal
            oprAll = opr.copy()

        # compute prediction errors for the OPR dataset
        maData = predictionError(maData, opr)

        # identify outlier match-alliance records using non-parametric boxplot
        # outlier computations - values more extreme than IQR_MULT times the
        # interquartile range outside the respective quartile are identified
        # as outliers
        # upper quartile is the 75th percentile of the data
        q3 = maData['score.errorS'].quantile(0.75)
        # lower quartile is the 25th percentile of the data
        q1 = maData['score.errorS'].quantile(0.25)
        # interquartile range is the difference between the upper and lower quartiles
        iqr = q3 - q1
        # high outlier limit is IQR_MULT * iqr above the upper quartile
        lim_hi = q3 + IQR_MULT * iqr
        # low outlier limit is IQR_MULT * iqr below the lower quartile
        lim_lo = q1 - IQR_MULT * iqr

        # look for outliers where the match-alliance prediction error is beyond
        # the outlier limits just calculated
        outliers = maData[(maData['score.errorS'] > lim_hi) |
                          (maData['score.errorS'] < lim_lo)]

        # if there are no outlier records, break out of the loop - the last OPR
        # calculated is the robust OPR
        if len(outliers) == 0:
            break

        # print to console if outliers are found
        print(f'Outlier(s) found on iteration {i}')
        for index, row in outliers.iterrows():
            print(
                f'Match {row.key} {row.color} ({row.team1} {row.team2} {row.team3}) - score {row.score} - pred {round(row["score.predS"], 1)}'
            )

        # find the indexes of the outlier records
        toDrop = list(outliers.index.values)
        # remove the outlier records from the qualification match-alliance dataset
        # before re-computing robust OPR on the next iteration
        maData.drop(toDrop, axis=0, inplace=True)

        # update the loop counter
        i += 1

        # run another iteration after outlier removal

    # prepare the OPR results for export
    # add event into the dataframe
    oprAll.insert(loc=0, column='event', value=event)
    # get needed columns from oprAll which is "standard" OPR
    oprAll = oprAll[['event', 'opr']]
    # give columns better names for export
    oprAll.columns = ['event', 'oprStd']
    # get needed columns from opr which is robust OPR
    opr = opr[['opr']]
    # give columns better names for export
    opr.columns = ['oprRobust']
    # combine standard and robust OPR data
    opr = pd.concat([oprAll, opr], axis=1, sort=False)
    # rearrange columns for easier human interpretation of the exported data
    opr = opr[['event', 'oprStd', 'oprRobust']]

    # round numbers in the dataframe to 1 place
    opr = opr.round(1)

    # export the results to CSV
    try:
        filename = 'robustOpr_' + event + '.csv'
        opr.to_csv(filename)
    except OSError:
        sys.exit(
            f'ERROR - output file could not be written - is {filename} open for editing?'
        )
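
# A small self-contained illustration of the boxplot outlier rule used above: values
# more than IQR_MULT * IQR outside the quartiles are flagged as outliers.
import pandas as pd

errors = pd.Series([-3.1, -1.2, 0.4, 0.8, 1.1, 1.9, 2.3, 14.0])
q1, q3 = errors.quantile(0.25), errors.quantile(0.75)
iqr = q3 - q1
lim_lo, lim_hi = q1 - 1.6 * iqr, q3 + 1.6 * iqr
print(errors[(errors < lim_lo) | (errors > lim_hi)])  # flags only the 14.0 record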
Example #39
0
    'j11vote1', 'j11vote2', 'j11maj1', 'j11maj2', 'codej12', 'j12vote1',
    'j12vote2', 'j12maj1', 'j12maj2', 'codej13', 'j13vote1', 'j13vote2',
    'j13maj1', 'j13maj2', 'codej14', 'j14vote1', 'j14vote2', 'j14maj1',
    'j14maj2', 'codej15', 'j15vote1', 'j15vote2', 'j15maj1', 'j15maj2',
    'j16maj1', 'j16vote1'
]
#
# to_dummies = ['month','day','method','state','district','origin','source','distjudg',
#               'applfrom','adminrev','opinstat','treat','classact','crossapp','counsel1','counsel2','sanction',
#               'initiate','numappel','appnatpr','appnatpr','appbus','appnonp','appfed','appsubst','appstate',
#               'appfiduc','ap_stid','genapel1','bank_ap1','genapel2','bank_ap2','appel1','appel2',]
print(df.shape)
df.drop(labels=del_cols, axis=1, inplace=True)
moredropcolumns = df.columns.tolist()  # .tolist?
for i in moredropcolumns:
    if len(pd.unique(df[i])) == 1:
        df.drop(labels=i, axis=1, inplace=True)
caseList = pd.unique(df['casenum'])
caseList = caseList[pd.notnull(caseList)].tolist()
print(len(caseList))
num_cores = multiprocessing.cpu_count()
print "num_cores is: ", num_cores


def do_to_case(case):
    newframe = pd.DataFrame()  ##  the rearrange of the original data
    output = [
    ]  ##   the corresponding alignment of judge 1 and judge 2, yes =1, no = -1
    subtest = df[df.casenum == case].reset_index(
        drop=True
    )  ## 'subtest' only take the records that have a specific case id
Example #40
0
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Initialisation:
a = pd.read_csv("/Users/michaelwehbe/Desktop/q.csv")
b = pd.read_csv("/Users/michaelwehbe/Desktop/w.csv")
c = pd.read_csv("/Users/michaelwehbe/Desktop/x.csv")

data_temp = pd.concat([a, b, c], axis=0)

n = len(data_temp['ID'])
n_secs = len(pd.unique(data_temp['ID']))
n_dates = len(pd.unique(data_temp['date']))

#Checking for NAs

if np.sum(data_temp['R'].isnull().sum()) == 0:
    print("No missing Values in the dataset")
else:
    print(np.sum(data_temp['R'].isnull().sum()), "in the dataset")

#Let's now sort the data by entering each security in a column, and each row would represent a date:

data_temp_2 = np.zeros((n_dates, n_secs))

for i in range(0, n_dates):  #This basically does what we described above.
    data_temp_2[i, :] = data_temp.iloc[i * n_secs:n_secs * (i + 1), 2]

data = pd.DataFrame(data_temp_2,
Example #41
0
def lagged_strat(lag):

    signal_lag, signal_temp_lag = lagged_signal(lag)
    signal_temp_lag_2 = pd.DataFrame(signal_temp_lag,
                                     columns=pd.unique(data_temp['ID']))

    #Strategy : Long top decile and short bottom decile
    #We will rank order our signals for each date:

    sorted_signal_lag = pd.DataFrame(np.sort(signal_lag, axis=1),
                                     index=pd.unique(data_temp['date']))

    #We have 690 securities, so 690 returns per date. so the first decile would be the smallest 69 returns and the 10th decile would be the largest 69 returns

    #Let's create two matrices, each with the IDs of the securities we are shorting or longing at each time:

    long_temp_lag = np.zeros((n_dates, 69))
    short_temp_lag = np.zeros((n_dates, 69))
    for i in range(0, n_dates):
        long_temp_lag[i, :] = signal_temp_lag_2.sort_values(
            by=i, axis=1).columns[n_secs - 69:n_secs]
        short_temp_lag[i, :] = signal_temp_lag_2.sort_values(
            by=i, axis=1).columns[0:69]

    long_positions_lag = pd.DataFrame(long_temp_lag.astype(int),
                                      index=pd.unique(data_temp['date']))
    short_positions_lag = pd.DataFrame(short_temp_lag.astype(int),
                                       index=pd.unique(data_temp['date']))

    #We want all the longs to have equal weight in our portfolio, and same for all the shorts
    #Hence each long security in the portfolio has weight 1/69 and short has -1/69, which satisfies all the given conditions

    #For simplicity of computations let's design a weight matrix:

    weights_lag = pd.DataFrame(np.zeros((n_dates, n_secs)),
                               index=pd.unique(data_temp['date']),
                               columns=pd.unique(data_temp['ID']))

    for i in range(0, n_dates):
        for j in range(0, 69):
            weights_lag[long_positions_lag.iloc[i, j]][i] = 1 / 69
            weights_lag[short_positions_lag.iloc[i, j]][i] = -1 / 69

    #Let's now compute the returns of our portfolio:

    portfolio_rets_temp_lag = np.array(
        weights_lag)[:n_dates - 1, :] * np.array(data)[1:, :]

    portfolio_rets_temp_lag_2 = []
    for i in range(0, n_dates - 1):
        portfolio_rets_temp_lag_2.append(np.sum(portfolio_rets_temp_lag[i, :]))

    portfolio_rets_lag = pd.DataFrame(portfolio_rets_temp_lag_2,
                                      columns=['Portfolio Returns'],
                                      index=pd.unique(data_temp['date'])[1:])

    #a
    #Let's compute the annualized mean return, volatility and sharpe ratio of the strategy and of the market portfolio:

    ann_mean_ret_strat_lag = np.mean(portfolio_rets_lag) * 252
    ann_vol_strat_lag = np.std(portfolio_rets_lag) * np.sqrt(252)
    ann_SR_strat_lag = ann_mean_ret_strat_lag / ann_vol_strat_lag  #Since we don't know what the risk free rate is

    return float(ann_mean_ret_strat_lag), float(ann_vol_strat_lag), float(
        ann_SR_strat_lag)
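
# A minimal usage sketch: evaluate the strategy for a few signal lags and collect the
# annualized statistics returned by lagged_strat() in a single table.
lags = [1, 2, 3, 5, 10]
summary = pd.DataFrame([lagged_strat(lag) for lag in lags],
                       index=lags,
                       columns=['ann_mean_ret', 'ann_vol', 'ann_sharpe'])
print(summary)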
Example #42
0
        ofname = run(args, workload)
        if (not verify(ofname)):
            fail.append(workload + "_run.csv")
            continue

        okay.append(workload + "_load.csv")
        okay.append(workload + "_run.csv")

        df = stats(ofname)

        operations = [
            '[OVERALL]', '[READ]', '[INSERT]', '[CLEANUP]', '[UPDATE]',
            '[READ-MODIFY-WRITE]'
        ]
        found_ops = pd.unique(df[0])
        headers = [
            'Operations', 'MinLatency(us)', 'AverageLatency(us)',
            '95thPercentileLatency(us)', '99thPercentileLatency(us)',
            'MaxLatency(us)'
        ]
        printable_headers = [
            'Operations', '#ofOperations', 'MinLatency(us)',
            'AverageLatency(us)', '95thPercentileLatency(us)',
            '99thPercentileLatency(us)', 'MaxLatency(us)'
        ]
        overall_headers = [
            'CreatePmemPool(ms)', 'RunTime(ms)', 'Throughput(ops/sec)'
        ]
        printable_overall_headers = [
            'Overall', 'CreatePmemPool(ms)', 'RunTime(ms)',
Example #43
0
# import the dataset into the dataframe, using pandas
# data = pd.read_csv('covtype.csv', sep=';')
# dataSample = data.sample(frac=0.1)
# data = pd.read_csv('Undersample.csv', sep=',')
# dataSample = data  # use 100% (there are only ~2k values...)
# data = pd.read_csv('Oversample.csv', sep=',')
# dataSample = data.sample(frac=0.1)
data = pd.read_csv('SMOTE_sample.csv', sep=',')
dataSample = data.sample(frac=0.1)


# Data preparation for the classification models
y: np.ndarray = dataSample.pop('Cover_Type(Class)').values
X: np.ndarray = dataSample.values
labels = pd.unique(y)
trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y)


# For printing data
def model_performance(tstY, prdY, n, d=' ', k=' ', n_name='n', d_name=' ', k_name=' '):
    #    import warnings
    #    warnings.filterwarnings('default')  # "error", "ignore", "always", "default", "module" or "once"
    accuracy = metrics.accuracy_score(tstY, prdY)
    precision = metrics.precision_score(tstY, prdY, average='macro')
    sensibility = metrics.recall_score(tstY, prdY, average='macro')
    print('Accuracy :', str(accuracy)[:6], \
          ' precision: ', str(precision)[:6], \
          ' sensibility: ' + str(sensibility)[:6],
          n_name, n, d_name, d, k_name, k)
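
# A minimal usage sketch (assumption: a simple k-NN baseline): score a classifier on
# the split above with the model_performance() helper just defined.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(trnX, trnY)
model_performance(tstY, knn.predict(tstX), n=5, n_name='n_neighbors')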
Example #44
0
Created on Fri Feb 12 12:28:53 2016

@author: Jleach1
"""

# %% import packages
import pandas as pd

# %%
d2 = pd.read_csv('../../../../data/d2_firm_level_data.csv')
d3 = pd.read_csv('../../../../data/d3_patent_data.csv')

# %% Compute collaboration
teams = d3.inv_num
teams_sets = [x.split(';') for x in teams]
invs = pd.unique([y for x in teams_sets for y in x])

# %% Reshape d3 to inventor level
d3_inv = pd.concat([d3.pnum, 
                    d3.inv_num.apply(lambda y: pd.Series(y.split(';')))], 
                    axis = 1)
                    
d3_inv = pd.melt(d3_inv, 
                 id_vars = 'pnum', 
                 value_vars = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25],
                 value_name = 'inv_num')
d3_inv = d3_inv.drop('variable', axis = 1)
d3_inv.to_csv("../../../../data/outputs/d3_inv.csv", index = False)

# %%
inv_list = pd.unique([inv for inv in d3_inv.inv_num])
Example #45
0
                          header=None)
motif_found = motif_found.sort_values([0, 1, 2])
motif_found = motif_found.values

### global Bonferroni over all pairs tested across all cluster
pvalues = np.asarray(motif_found[:, 3])
# empty vectors for pvals
pvalues_Gbonf = np.zeros(motif_found.shape[0])
for i in range(0, motif_found.shape[0]):
    pvalues_Gbonf[i] = pvalues[i] * motif_found.shape[0]
    if pvalues_Gbonf[i] > 1:
        pvalues_Gbonf[i] = 1

# performed corrections for each cluster in the motif_found file (designated by values in first column)
# pd.unique retains order where np.unique does not
for clust in pd.unique(motif_found[:, 0]):
    print(clust)
    motifs = motif_found[motif_found[:, 0] == clust]
    pvalues = np.asarray(motifs[:, 3])
    # empty vectors for pvals
    pvalues_bonf = np.zeros(motifs.shape[0])
    pvalues_bh = np.zeros(motifs.shape[0])
    # calculate Bonferroni correction
    for i in range(0, motifs.shape[0]):
        pvalues_bonf[i] = pvalues[i] * motifs.shape[0]
        if pvalues_bonf[i] > 1:
            pvalues_bonf[i] = 1
    #calculate BH correction
    pvalues_bh = bh(pvalues)
    motifs = np.column_stack(
        (motifs[:, [0, 1, 2, 4, 5, 6, 3]], pvalues_bh, pvalues_bonf))
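
# The bh() helper called above is not shown. A hypothetical sketch of a
# Benjamini-Hochberg adjustment with the same call signature (raw p-values in,
# adjusted p-values out). As noted in the comment above, pd.unique([3, 1, 3, 2])
# keeps first-appearance order ([3, 1, 2]) while np.unique sorts ([1, 2, 3]).
import numpy as np

def bh(pvalues):
    p = np.asarray(pvalues, dtype=float)
    n = p.size
    order = np.argsort(p)                         # ascending raw p-values
    adj = p[order] * n / np.arange(1, n + 1)      # p_(i) * n / rank
    adj = np.minimum.accumulate(adj[::-1])[::-1]  # enforce monotonicity
    out = np.empty(n)
    out[order] = np.clip(adj, 0.0, 1.0)           # map back to the input order
    return out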
def generate_data(raw_data, output_dir, n_heldout_users, min_uc, min_sc):
    """Generates and writes train, validation and test data.

  The raw_data is first split into train, validation and test by user. For the
  validation set, each user's ratings are randomly partitioned into two subsets
  following a (80, 20) split (see split_train_test_proportion), and written to
  validation_tr.csv and validation_te.csv. A similar split is applied to the
  test set.

  Args:
    raw_data: a DataFrame of (userId, movieId, rating).
    output_dir: path to the output directory.
    n_heldout_users: this many users are held out for each of the validation and
      test sets.
    min_uc: filter out users with fewer than min_uc ratings.
    min_sc: filter out items with fewer than min_sc ratings.
  """
    raw_data, user_activity, item_popularity = filter_triplets(
        raw_data, min_uc, min_sc)
    sparsity = 1. * raw_data.shape[0] / (user_activity.shape[0] *
                                         item_popularity.shape[0])
    print('After filtering, there are %d watching events from %d users and %d '
          'movies (sparsity: %.3f%%)' %
          (raw_data.shape[0], user_activity.shape[0], item_popularity.shape[0],
           sparsity * 100))
    unique_uid = user_activity.index
    np.random.seed(98765)
    idx_perm = np.random.permutation(unique_uid.size)
    unique_uid = unique_uid[idx_perm]
    n_users = unique_uid.size
    tr_users = unique_uid[:(n_users - n_heldout_users * 2)]
    vd_users = unique_uid[(n_users - n_heldout_users * 2):(n_users -
                                                           n_heldout_users)]
    te_users = unique_uid[(n_users - n_heldout_users):]
    train_plays = raw_data.loc[raw_data['userId'].isin(tr_users)]
    unique_sid = pd.unique(train_plays['movieId'])
    show2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))
    profile2id = dict((pid, i) for (i, pid) in enumerate(unique_uid))

    def numerize(tp):
        uid = [profile2id[x] for x in tp['userId']]
        sid = [show2id[x] for x in tp['movieId']]
        return pd.DataFrame(data={
            'uid': uid,
            'sid': sid
        },
                            columns=['uid', 'sid'])

    pro_dir = output_dir
    if not os.path.exists(pro_dir):
        os.makedirs(pro_dir)
    with open(os.path.join(pro_dir, 'unique_sid.txt'), 'w') as f:
        for sid in unique_sid:
            f.write('%s\n' % sid)
    vad_plays = raw_data.loc[raw_data['userId'].isin(vd_users)]
    vad_plays = vad_plays.loc[vad_plays['movieId'].isin(unique_sid)]
    vad_plays_tr, vad_plays_te = split_train_test_proportion(vad_plays)
    test_plays = raw_data.loc[raw_data['userId'].isin(te_users)]
    test_plays = test_plays.loc[test_plays['movieId'].isin(unique_sid)]
    test_plays_tr, test_plays_te = split_train_test_proportion(test_plays)

    train_data = numerize(train_plays)
    train_data.to_csv(os.path.join(pro_dir, 'train.csv'), index=False)

    vad_data_tr = numerize(vad_plays_tr)
    vad_data_tr.to_csv(os.path.join(pro_dir, 'validation_tr.csv'), index=False)

    vad_data_te = numerize(vad_plays_te)
    vad_data_te.to_csv(os.path.join(pro_dir, 'validation_te.csv'), index=False)

    test_data_tr = numerize(test_plays_tr)
    test_data_tr.to_csv(os.path.join(pro_dir, 'test_tr.csv'), index=False)

    test_data_te = numerize(test_plays_te)
    test_data_te.to_csv(os.path.join(pro_dir, 'test_te.csv'), index=False)
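
# split_train_test_proportion() is referenced above but not shown. A hypothetical
# sketch matching the docstring (per user, hold out roughly 20% of the ratings):
import numpy as np
import pandas as pd

def split_train_test_proportion(data, test_prop=0.2, seed=98765):
    rng = np.random.RandomState(seed)
    tr_list, te_list = [], []
    for _, group in data.groupby('userId'):
        n_items = len(group)
        if n_items >= 5:
            test_mask = np.zeros(n_items, dtype=bool)
            picks = rng.choice(n_items, size=int(test_prop * n_items), replace=False)
            test_mask[picks] = True
            tr_list.append(group[~test_mask])
            te_list.append(group[test_mask])
        else:
            tr_list.append(group)  # too few ratings: keep everything in train
    data_tr = pd.concat(tr_list)
    data_te = pd.concat(te_list) if te_list else data.iloc[0:0]
    return data_tr, data_te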
Example #47
0
def merge_sub(sub1, sub2, bar, driver):
    if space_var.get() == 1:
        space_sub = '\n&nbsp;\n'
    else:
        space_sub = '\n'
    sub1_df = dataframe_sub(sub1, "en")
    sub2_df = dataframe_sub(sub2, "ru")
    df = pd.concat([sub1_df, sub2_df], axis=0)
    df['sum'] = df[['start', 'end']].sum(axis=1)
    df['plus'] = (df['start'] + df['end']) / 2
    df = df.sort_values(by='start', ascending=True)
    # agglomerative clustering
    if clusters_auto_var.get() == 1:
        clusters_list = []
        # assess clustering quality with the silhouette score
        silhouette = []
        for i in np.linspace(0.2, 1, 20):
            root.update()
            threshold = float(i) * 10000
            clustering = AgglomerativeClustering(
                n_clusters=None,
                distance_threshold=threshold).fit(df[['start', 'end']])
            clusters = clustering.labels_
            clusters_list.append(len(pd.unique(clusters)))
            score = silhouette_score(df[['start', 'end']], clusters)
            silhouette.append(score)
        max_silhouette = np.argmax(silhouette)
        clustering = AgglomerativeClustering(
            n_clusters=clusters_list[max_silhouette]).fit(df[['start', 'end']])
    else:
        threshold = float(clusters_manual_entry.get()) * 10000
        clustering = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=threshold,
            linkage=clusters_method_combobox.get()).fit(df[['start', 'end']])
    clusters = clustering.labels_
    # add the clusters that were found
    df['cluster'] = clusters
    bar_subs = float(bar) / float(len(pd.unique(clusters)))
    # create a new subtitle file
    double_sub = pysrt.SubRipFile(encoding='utf-8')
    translate_list = pysrt.SubRipFile(encoding='utf-8')
    for n, i in enumerate(pd.unique(clusters)):
        root.update()
        progressBar['value'] += bar_subs
        df_en = df[(df['language'] == 'en') & (df['cluster'] == i)]
        df_ru = df[(df['language'] == 'ru') & (df['cluster'] == i)]
        df_group_en = df_en.groupby('cluster').agg({
            'text': ' '.join,
            'start': min,
            'end': max,
            'language': 'first'
        })
        df_group_ru = df_ru.groupby('cluster').agg({
            'text': ' '.join,
            'start': min,
            'end': max,
            'language': 'first'
        })
        df_group = df_group_en.merge(
            df_group_ru,
            on=['cluster', 'text', 'start', 'end', 'language'],
            how='outer').groupby('cluster').agg({
                'text': space_sub.join,
                'start': 'first',
                'end': 'first',
                'language': ''.join
            })
        sub = pysrt.SubRipItem(index=n + 1,
                               start=int(df_group.iloc[0]['start']),
                               end=int(df_group.iloc[0]['end']),
                               text=str(df_group.iloc[0]['text']))
        double_sub.append(sub)
        if translate_var.get() == 1 and df_group['language'].values == 'en':
            translate_list.append(sub)
    if translate_var.get() == 1 and translate_list:
        translate_sub(translate_list, bar, driver)
    # re-index the subtitles
    double_sub.clean_indexes()
    return double_sub
Example #48
0
strings

vals = 'a,b, guido'
vals.split(",")
import pandas as pd
import matplotlib.pyplot as plt 
values = pd.Series([0, 1, 0, 0] * 2)
dim = pd.Series(['apple', 'orange'])
values
dim
#Categorical data:
import numpy as np
import pandas as pd 
values = pd.Series(['apple',
'orange', 'apple', 'apples'] * 2)
pd.unique(values)
pd.value_counts(values)
values = pd.Series([0, 1, 0, 0] * 2)
dim = pd.Series(["apple", "apples"])
dim.take(values)

dim2 = pd.Series(["apple", "apples", "orange"])
dim2.take(values)

#Categorical type in pandas:
fruits = ['apple', 'orange', 'apple', 'apples'] * 2
N = len(fruits)
df = pd.DataFrame({'fruit': fruits,
'basket_id': np.arange(N),
'count': np.random.randint(3, 15, size = N),
'weight': np.random.uniform(0, 4, size = N)},
Example #49
0
            levels.append(levels[-1]+1)

        elif prev[1:].replace('RL','rl').islower() and current[1:].replace('RL','rl').isupper():

            metas.append(prev)
            
            levels.append(levels[-1]+1)

        elif prev[1:].replace('RL','rl').isupper() and current[1:].replace('RL','rl').isupper():

            metas.append(metas[-1])
            levels.append(levels[-1])

        elif prev[1:].replace('RL','rl').isupper() and current[1:].replace('RL','rl').islower():

            metas.append(pd.unique(metas)[-2])
            
            levels.append(levels[-1]-1)
        else:
            # Debugging output, just in case. Execution should never reach this branch.
            print(prev)
            print(current)
        prev = current



    df['meta']=metas

    dfs.append(df)
rikokset = pd.concat(dfs).iloc[:,:-1]
rikokset = rikokset.set_index('Alue')
def writeChemicalMatches():
    all_list_names = extract_flows_for_chemical_matcher()
    if len(all_list_names) == 0:
        log.error('no local flows found, chemical matches can not be assessed, '
                  'generate local inventories before continuing.')
        return

    # Determine whether to use the id or name to query SRS
    inventory_query_type = {"RCRAInfo": "list",
                            "TRI": "list",
                            "NEI": "list",
                            "eGRID": "name",
                            "DMR": "list",
                            "GHGRP": "name"}

    # Create a df to store the results
    all_lists_srs_info = pd.DataFrame(columns=["FlowName", "SRS_ID",
                                               "SRS_CAS", "Source"])
    errors_srs = pd.DataFrame(columns=["FlowName", "Source", "ErrorType"])

    # Loop through sources, querying SRS by the query type defined for the
    # source, merge the results with the flows for that inventory.
    # Store errors in a separate dataframe
    sources = list(pd.unique(all_list_names['Source']))
    for source in sources:
        log.info('accessing SRS for ' + source)
        # Get df with inventory flows
        inventory_flows = all_list_names[all_list_names['Source'] ==
                                         source].reset_index(drop=True)

        if inventory_query_type[source] == 'list':
            # make sure flowid is a string
            inventory_flows['FlowID'] = inventory_flows['FlowID'].map(str)
            # query SRS to get entire list and then merge with it
            list_srs_info = get_SRSInfo_for_program_list(source)
            # merge this with the original list using FlowID
            list_srs_info = pd.merge(inventory_flows, list_srs_info,
                                     left_on='FlowID', right_on='PGM_ID',
                                     how='left')
        elif inventory_query_type[source] == 'name':
            # For names, query SRS one by one to get results
            list_srs_info = pd.DataFrame(columns=["FlowName", "SRS_ID",
                                                  "SRS_CAS", "Source"])
            errors_srs = pd.DataFrame(columns=["FlowName", "Source", "ErrorType"])
            # Cycle through names one by one
            for index, row in inventory_flows.iterrows():
                chemical_srs_info = pd.DataFrame(columns=["FlowName", "SRS_ID",
                                                          "SRS_CAS", "Source"])
                error_srs = pd.DataFrame(columns=["FlowName", "Source",
                                                  "ErrorDescription"])
                name = row["FlowName"]
                result = get_SRSInfo_for_substance_name(name)
                if isinstance(result, str):
                    # This is an error
                    error_srs.loc[0, 'FlowName'] = name
                    #error_srs.loc[0, 'FlowID'] = id
                    error_srs.loc[0, 'Source'] = source
                    error_srs.loc[0, 'ErrorDescription'] = result
                else:
                    chemical_srs_info = result
                    chemical_srs_info.loc[0, "FlowName"] = name
                    #chemical_srs_info.loc[0, "FlowID"] = name
                    chemical_srs_info.loc[0, "Source"] = source

                errors_srs = pd.concat([errors_srs, error_srs], sort=False)
                list_srs_info = pd.concat([list_srs_info, chemical_srs_info],
                                          sort=False)

        all_lists_srs_info = pd.concat([all_lists_srs_info, list_srs_info],
                                       sort=False)

    # Remove waste code and PGM_ID
    all_lists_srs_info = all_lists_srs_info.drop(columns=['PGM_ID'])
    all_lists_srs_info = all_lists_srs_info.sort_values(['Source', 'FlowName',
                                                         'SRS_ID', 'FlowID'])

    # Add in manually found matches
    all_lists_srs_info = add_manual_matches(all_lists_srs_info)

    subset = ['FlowID', 'FlowName', 'Source']

    # Write to csv
    all_lists_srs_info = all_lists_srs_info[['FlowID', 'FlowName', 'SRS_CAS',
                                             'SRS_ID', 'Source']].drop_duplicates(subset)
    all_lists_srs_info.to_csv(OUTPUT_PATH
                              .joinpath('ChemicalsByInventorywithSRS_IDS_forStEWI.csv'),
                              index=False)
    #errors_srs.to_csv('work/ErrorsSRS.csv',index=False)

    # Write flows missing srs_ids to file for more inspection
    flows_missing_SRS_ID = all_lists_srs_info[all_lists_srs_info['SRS_ID'].isnull()]
    flows_missing_SRS_ID.to_csv(OUTPUT_PATH
                                .joinpath('flows_missing_SRS_ID.csv'),
                                index=False)
Example #51
0
# PRE vs HIGH
# data = data.loc[(data["Condition"] != "Low") & (data["Condition"] != "Post")]

X = data.iloc[:, 15:]

# PRE & POST vs LOW vs HIGH
# y = (data["Condition"] == "Low").astype(int) + (data["Condition"] == "High").astype(int) * 2

# LOW vs HIGH or PRE vs HIGH
y = (data["Condition"] == "High").astype(int)

# PRE vs LOW
# y = (data["Condition"] == "Low").astype(int)

hashes = pd.unique(data["hash"])


# %%
def cv_generator():
    for hash in hashes:
        cond = data["hash"] == hash
        yield np.where(~cond)[0], np.where(cond)[0]


parameters = {
    "svc__kernel": ["rbf"],
    "svc__gamma": [1000**i for i in np.linspace(-1, 1, 1000)],
    "svc__C": [1000**i for i in np.linspace(-1, 1, 1000)]
}
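
# A minimal wiring sketch (assumptions: the 'svc__' prefixes in `parameters` imply a
# Pipeline step named 'svc'; the StandardScaler step is added here for illustration).
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])
# cv_generator() yields one fold per recording hash, so samples that share a hash
# never appear in both the train and the test side of a fold.
search = GridSearchCV(pipe, parameters, cv=cv_generator(), n_jobs=-1)
search.fit(X, y)
print(search.best_params_, search.best_score_)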
Example #52
0
# degreeTwo.py
# Aakash Indurkhya
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import itertools as itools
import networkx as nx
import sys
import pickle

data = pd.read_csv(sys.argv[1])
antenna_map = {}
antenna_map_c = {}
G = nx.Graph()
antennas = list(pd.unique(data['antenna_id']))

for a in antennas:
    G.add_node(a)

pairs = list(itools.combinations(antennas, 2))
for pair in pairs:
    # print pair
    id1, id2 = pair
    if id1 < id2:
        a = id1
        b = id2
    else:
        a = id2
        b = id1
    antenna_map[(a, b)] = 0
    antenna_map_c[(a, b)] = 0
Example #53
0
 def get_column(self, features):
     return list(pd.unique(list(features.values())))
for district_assignment_col in district_column_names:

    agg_data = data.dissolve(by=district_assignment_col, aggfunc='sum')

    # ---- Geographic Metrics ------

    # County Splits
    if cong:
        threshold_pop = cong_factor * cong_seat_pop
    else:
        total_pop = sum(data[pop_col])
        threshold_pop = leg_factor * total_pop / num_dist

    num_splits = 0
    lower_county_splits = 0
    counties = list(filter(None, pd.unique(data[county_assignment_col])))

    for county in counties:
        county_subset = data[data[county_assignment_col] == county]
        split_times = len(pd.unique(county_subset[district_assignment_col]))
        if split_times > 1:
            num_splits = num_splits + 1
        county_pop = sum(county_subset[pop_col])
        if county_pop > threshold_pop:
            lower_county_splits = lower_county_splits + 1

    upper_county_splits = len(counties)

    county_splits_scores.append(num_splits)

    # --- Compactness -----
# data = data[data['city'].isin(a)]
# a = ['Apartment','House','Townhouse','Condominium','Serviced apartment','Villa','Guesthouse','Guest suite','Bed and breakfast','Loft','Bungalow','Cottage']
# data = data[data['property_type'].isin(a)].reset_index(drop=True)
a = ['Entire home/apt','Private room']
data = data[data['room_type'].isin(a)].reset_index(drop=True)
a = ['strict_14_with_grace_period','moderate','flexible']
data = data[data['cancellation_policy'].isin(a)].reset_index(drop=True)
# a = ['Central Business District','Southbank','St Kilda','South Yarra','Docklands','Carlton','Richmond','Brunswick','Fitzroy','Collingwood','South Melbourne']
# data = data[data['neighborhood'].isin(a)].reset_index(drop=True)
# data = data.replace({'strict_14_with_grace_period': 'strict'})
# data = data.drop(columns = ['neighborhood','room_type','cancellation_policy'])

for c in category_features:
    one_hot = pd.get_dummies(data[c])
    # print(one_hot.shape[1])
    uniq = pd.unique(data[c])
    # print(c)
    # print(data[c].value_counts())
    # print(uniq)
    # print(len(uniq))
    F, p_value = stats.f_oneway(*(data[data[c] == u]['price'] for u in uniq))
    # print(c, F, p_value)
    data = data.drop(columns=c, axis=1)
    if p_value <= 0.05:
        # print(c)
        data = data.join(one_hot)
#######################################################################################################################
# calculate correlations between numerical variables and the target variable to find important ones
for c in numerical_features:
    corr = np.corrcoef(data['price'], data[c])
    print(c, corr[0,1])
Example #56
0
    for w in tokens:
        if w not in stop_words:
            filtered_string.append(w)

    stemmed_string = []
    for w in filtered_string:
        stemmed_string.append(ps.stem(w))

    return " ".join(stemmed_string)


data = pd.read_csv("hack.csv")
data = data[data["Tier"] == "Total"]
columns = ["".join(process_str(i)) for i in list(data.columns)]
states = [ps.stem(str(i)) for i in pd.unique(data["State"]).tolist()]
districts = [ps.stem(str(i)) for i in pd.unique(data["District"]).tolist()]


def analyze(string):
    try:
        our_str = process_str(string)
        val = max([[process.extractOne(our_str, i[0]), i[1]]
                   for i in [[states, "State"], [districts, "District"]]],
                  key=lambda x: x[0][1])

        cat = val[1] if val[0][1] >= 85 else "Nation"
        if cat == "State":
            val = pd.unique(data["State"]).tolist()[states.index(
                val[0][0])] if val[0][1] >= 70 else ""
        elif cat == "District":
def print_overlap_of_algorithm(name, all_pairs_unsorted, co_changes_unsorted, include_class_level=True, include_package_level=True, calculate_chi_square=True, calculate_precede_values=True):
    print("--- Overlap ", name, " co-changes and smells: ---")
    all_pairs_unsorted = all_pairs_unsorted.drop(['file1Size', 'file2Size'], axis=1)
    co_changes_unsorted = co_changes_unsorted.drop(['startdate', 'enddate', 'Unnamed: 0'], axis=1)
    # Class level data
    all_pairs_no_package = all_pairs_unsorted.drop(['package1', 'package2'], axis=1)  # This drops rows without both packages. May only be done for class-level analysis
    all_pairs_no_package = order_file1_and_file2(all_pairs_no_package)
    cc_pairs_no_package = co_changes_unsorted.drop(['package1', 'package2'], axis=1)  # This drops rows without both packages. May only be done for class-level analysis
    cc_pairs_no_package = order_file1_and_file2(cc_pairs_no_package)

    class_smell_pairs_with_date = pd.DataFrame(columns=['file1', 'file2'])
    if include_class_level:
        class_smell_pairs_with_date = load_pickle("class_smell_pairs_with_date")
        if class_smell_pairs_with_date is None:
            class_smell_pairs_with_date = order_file1_and_file2(get_project_class_smells_in_range(calculate_precede_values))  # df: file1, file2
            # Find file pairs that are part of the same class-level smell:
            class_smell_pairs_with_date = join_helper.perform_chunkified_pair_join(all_pairs_no_package, class_smell_pairs_with_date, level='file', compare_dates=False)
            save_pickle(class_smell_pairs_with_date, "class_smell_pairs_with_date")

    del all_pairs_no_package
    gc.collect()
    class_smell_pairs_with_date.info(verbose=False, memory_usage="deep")

    # Package level data
    all_pairs_unsorted.dropna(inplace=True)
    co_changes_unsorted.dropna(inplace=True)
    all_pairs_with_package = order_package1_and_package2(order_file1_and_file2(all_pairs_unsorted))
    cc_pairs_with_package = order_package1_and_package2(order_file1_and_file2(co_changes_unsorted))
    del all_pairs_unsorted
    del co_changes_unsorted
    gc.collect()


    package_smell_pairs_with_date = pd.DataFrame(columns=['file1', 'file2'])
    if include_package_level:
        package_smell_pairs_with_date = load_pickle("package_smell_pairs_with_date")
        if package_smell_pairs_with_date is None:
            package_smell_pairs_with_date = order_package1_and_package2(get_project_package_smells_in_range(calculate_precede_values))  # df: package1, package2
            # We want to find file pairs whose package are part of the same smell:
            package_smell_pairs_with_date = join_helper.perform_chunkified_pair_join(all_pairs_with_package, package_smell_pairs_with_date, level='package', compare_dates=False)
            # Note: we are interested in (file1, file2) in package_smell_pairs

            save_pickle(package_smell_pairs_with_date, "package_smell_pairs_with_date")

    package_smell_pairs_with_date.info(verbose=False, memory_usage="deep")
    del all_pairs_with_package
    gc.collect()
    # Combine the pairs
    df_list = [class_smell_pairs_with_date, package_smell_pairs_with_date]
    smell_pairs_with_date = pd.concat(df_list)

    del class_smell_pairs_with_date
    del package_smell_pairs_with_date
    gc.collect()

    smell_pairs_with_date.info(verbose=False, memory_usage="deep")

    if include_class_level:
        # Overlapping pairs contains at least: file1, file2, parsedSmellFirstDate, parsedSmellLastDate, parsedStartDate, parsedEndDate
        overlapping_cc_smells = join_helper.perform_chunkified_pair_join(cc_pairs_no_package, smell_pairs_with_date)
    else:
        # Overlapping pairs contains at least: file1, file2, parsedSmellFirstDate, parsedSmellLastDate, parsedStartDate, parsedEndDate
        overlapping_cc_smells = join_helper.perform_chunkified_pair_join(cc_pairs_with_package, smell_pairs_with_date)

    overlapping_cc_smells.info(verbose=False, memory_usage="deep")
    del smell_pairs_with_date
    gc.collect()

    # RQ4: Are smells introduced before or after files start co-changing?
    if calculate_precede_values and len(overlapping_cc_smells) > 0:
        # Filter smells and co-changes which are already present at the start of the analysis. We are not sure what their real start date is.
        overlapping_cc_smells.drop(['parsedVersionDate', 'package1', 'package2'], axis=1, inplace=True)
        overlapping_cc_smells.info(verbose=False, memory_usage="deep")
        gc.collect()
        print("unfiltered:", len(overlapping_cc_smells))
        overlapping_cc_smells = overlapping_cc_smells[overlapping_cc_smells['parsedSmellFirstDate'].dt.floor('d') != analysis_start_date.date()]
        gc.collect()
        print("after filtering smells: ", len(overlapping_cc_smells))  # Note: this counts joined rows
        overlapping_cc_smells = overlapping_cc_smells[overlapping_cc_smells['parsedStartDate'].dt.floor('d') != analysis_start_date.date()]
        gc.collect()
        print("filtered ccs: ", len(overlapping_cc_smells))

        # Compare the two start dates and count which is earlier how often. Also count ties!
        # group by: file1, file2, smellId

        earlier_smell_rows = overlapping_cc_smells[
            overlapping_cc_smells['parsedSmellFirstDate'].dt.floor('d') < overlapping_cc_smells[
                'parsedStartDate'].dt.floor('d')]
        earlier_smell_pairs = len(pd.unique(earlier_smell_rows[['file1', 'file2', 'uniqueSmellID']].values.ravel('K')))
        add_result(project_name, name + "_earlier_smell_pairs", earlier_smell_pairs)
        del earlier_smell_rows
        gc.collect()

        earlier_ccs_rows = overlapping_cc_smells[
            overlapping_cc_smells['parsedStartDate'].dt.floor('d') < overlapping_cc_smells[
                'parsedSmellFirstDate'].dt.floor('d')]
        earlier_ccs_pairs = len(pd.unique(earlier_ccs_rows[['file1', 'file2', 'uniqueSmellID']].values.ravel('K')))
        add_result(project_name, name + "_earlier_ccs_pairs", earlier_ccs_pairs)

        del earlier_ccs_rows
        gc.collect()

        tied_rows = overlapping_cc_smells[
            overlapping_cc_smells['parsedStartDate'].dt.floor('d') == overlapping_cc_smells[
                'parsedSmellFirstDate'].dt.floor('d')]
        tied_pairs = len(pd.unique(tied_rows[['file1', 'file2', 'uniqueSmellID']].values.ravel('K')))
        add_result(project_name, name + "_tied_pairs", tied_pairs)

    elif calculate_precede_values and len(overlapping_cc_smells) == 0:
        add_result(project_name, name + "_earlier_smell_pairs", 0)
        add_result(project_name, name + "_earlier_ccs_pairs", 0)
        add_result(project_name, name + "_tied_pairs", 0)
Example #58
0
s_cols = ['424']
for s_col in s_cols:
    temp[s_col] = temp[s_col].apply(lambda x : unit_transform_xt(str(x)) if pd.notnull(x) else x)
    temp[s_col] = temp[s_col].apply(lambda x : unit_transform_xt2(str(x)) if pd.notnull(x) else x)
    temp[s_col] = temp[s_col].apply(lambda x : unit_transform_xt3(str(x)) if pd.notnull(x) else x)


for col in cols:
    if (np.array(temp[col]).dtype) == 'object':
        obj_list.append(col)
        try:
            temp[col] = temp[col].apply(lambda x : float(x) )
        except:
            print (col)
            obj_list_4.append(col)
            print (pd.unique(temp[col]))

            
            
dealcol=[1325,  425 , 437 ,3191 , 547 , 1321,  3203,  2233,  3485 , 30007 , 549, 424  ,459101 , 2229 ,901  ,1322 ,1326 ,3429 ,3430 , 459102 , 3194  ,3198 , 733, 212 , 2302]
dealcol=[str(i) for i in dealcol]
unhealth=temp[dealcol].select_dtypes(include=['object'])
for i in unhealth.columns:
    print(i+'*****')
    print(unhealth[i].unique())

dropcol=['547',   '2302',  '733',]
temp=temp.drop(dropcol,axis=1)

num=data.select_dtypes(include=['float64'])
result=pd.concat([temp, num], axis=1)
Example #59
0
submit_df['msg_date'] = pd.to_datetime(submit_df['msg_date'])
submit_date = submit_df['msg_date'].dt.weekday
submit_date = submit_date.apply(convert_date)
submit_date = submit_date.value_counts().reindex(
    ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])

channel_submit = submit_df.groupby('channel_name').size()
channel_submit = channel_submit.reset_index().rename(columns={0: 'count'})
st.title("DataCracy Submit Dashboard")

matplotlib.rcParams['font.family'] = "sans-serif"
matplotlib.rcParams['font.sans-serif'] = "Open Sans"
plt.rcParams['patch.edgecolor'] = 'black'

select = list(pd.unique(submit_df['DataCracy_role']))
select.append('All Group')
option = st.selectbox('Choose submit by group', sorted(select))

if option == 'Learner_Gr1':
    st.write(f'Group 1 submited {len_submit_1} assignments')
    st.write(f'Group 1 have {len_review_1} submit be reviewed')
    sizes_all = np.array([len_review, len_submit - len_review])
    sizes_gr1 = np.array([len_review_1, len_submit_1 - len_review_1])

    def func(pct, allvals):
        absolute = int(round(pct / 100. * np.sum(allvals)))
        return "{:.1f}%\n({:d})".format(pct, absolute)

    # sizes =[len_submit, len_submit-len_review]
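
    # A minimal usage sketch (the wedge labels are assumptions): use func() as the
    # autopct callback so each pie wedge shows its percentage and absolute count.
    fig1, ax1 = plt.subplots()
    ax1.pie(sizes_gr1,
            labels=['Reviewed', 'Not reviewed'],
            autopct=lambda pct: func(pct, sizes_gr1))
    ax1.set_title('Group 1 submissions')
    st.pyplot(fig1)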
Example #60
0
def con3(query):
    sparql = SPARQLWrapper("http://dbtune.org/musicbrainz/sparql")
    print(query)
    construct_query = """
            PREFIX mo: <http://purl.org/ontology/mo/>
            PREFIX mbz: <http://purl.org/ontology/mbz#>
            PREFIX owl: <http://www.w3.org/2002/07/owl#>    
            PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
            PREFIX bio: <http://purl.org/vocab/bio/0.1/>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
            PREFIX tags: <http://www.holygoat.co.uk/owl/redwood/0.1/tags/>
            PREFIX geo: <http://www.geonames.org/ontology#>
            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
            PREFIX lingvoj: <http://www.lingvoj.org/ontology#>
            PREFIX rel: <http://purl.org/vocab/relationship/>
            PREFIX vocab: <http://dbtune.org/musicbrainz/resource/vocab/>
            PREFIX event: <http://purl.org/NET/c4dm/event.owl#>
            PREFIX map: <file:/home/moustaki/work/motools/musicbrainz/d2r-server-0.4/mbz_mapping_raw.n3#>
            PREFIX db: <http://dbtune.org/musicbrainz/resource/>
            PREFIX foaf: <http://xmlns.com/foaf/0.1/>
            PREFIX dc: <http://purl.org/dc/elements/1.1/>
            SELECT DISTINCT  ?v1 ?v2 ?v3 ?v4 
            WHERE { 
            ?r rdfs:label \""""
    construct_query = construct_query + query
    construct_query = construct_query + """\" .
            ?r dc:title ?v1 .
            ?r mo:track_number ?v2 .
            ?r foaf:maker ?v3 .
            ?v3 rdfs:label ?v4
            }
            ORDER BY ?v2 ?v4
            LIMIT 100"""

    sparql.setQuery(construct_query)
    sparql.setReturnFormat(JSON)
    a = sparql.query().convert()
    b = a["results"]["bindings"]

    Art = []
    t_no = []
    url = []
    for m in b:
        Art.append(m["v4"]["value"])
        t_no.append(m["v2"]["value"])
        url.append(m["v3"]["value"])

    d = []
    d = pd.unique(Art)
    Albums = []
    for i in range(len(d)):
        Temp = []
        for j in range(len(Art)):
            if (Art[j] == d[i]):
                t = []
                t.append(Art[j])
                t.append(t_no[j])
                t.append(url[j])
                Temp.append(t)
        Albums.append(Temp)

    return Albums[0:19]