def logistic_test_using_cosine(score_feature=False):
    logger.info('using cosine features in logistic regression')
    if score_feature:
        logger.info('also use score feature')
    Cs = [2**t for t in range(0, 10, 1)]
    Cs.extend([3**t for t in range(1, 10, 1)])
    snli2cosine = SNLI2Cosine('/home/junfeng/word2vec/GoogleNews-vectors-negative300.bin')
    logger.info('loading snli data ...')
    train_df = pd.read_csv('./snli/snli_1.0/snli_1.0_train.txt', delimiter='\t')
    train_df = train_df[pd.notnull(train_df.sentence2)]
    train_df = train_df[train_df.gold_label != '-']
    train_df = train_df[:(len(train_df) // 3)]  # integer division so the slice index stays an int
    train_df.reset_index(inplace=True)
    test_df = pd.read_csv('./snli/snli_1.0/snli_1.0_test.txt', delimiter='\t')
    test_df = test_df[pd.notnull(test_df.sentence2)]
    test_df = test_df[test_df.gold_label != '-']
    test_df.reset_index(inplace=True)
    X_train, train_labels, X_test, test_labels = snli2cosine.calculate_cosine_features(train_df, test_df)
    if score_feature:
        y_train_proba, y_test_proba = joblib.load('./snli/logistic_score_snli.pkl')
        # y_train_proba = y_train_proba.flatten()
        # y_test_proba = y_test_proba.flatten()
        X_train = np.concatenate([X_train, y_train_proba.reshape((-1, 1))], axis=1)
        X_test = np.concatenate([X_test, y_test_proba.reshape((-1, 1))], axis=1)
    logger.info('X_train.shape: {0}'.format(X_train.shape))
    logger.info('X_test.shape: {0}'.format(X_test.shape))
    logreg = LogisticRegressionCV(Cs=Cs, cv=3, n_jobs=10, random_state=919)
    logreg.fit(X_train, train_labels)
    logger.info('best C is {0}'.format(logreg.C_))
    y_test_predicted = logreg.predict(X_test)
    acc = accuracy_score(test_labels, y_test_predicted)
    logger.info('test data predicted accuracy: {0}'.format(acc))
def validate_label_generation():
    mals1_df = pd.read_csv('data/sorted-train-labels-vs251-252.csv')
    mals2_df = pd.read_csv('data/sorted-train-labels-vs263-264-apt.csv')
    counter = 0
    m1_x = np.array(mals1_df['malware_type_x'])
    m1_f = np.array(mals1_df['family_name'])
    m1_sl = np.array(mals1_df['sample_label'])
    m1_fl = np.array(mals1_df['family_label'])
    m2_x = np.array(mals2_df['malware_type_x'])
    m2_f = np.array(mals2_df['family_name'])
    m2_sl = np.array(mals2_df['sample_label'])
    m2_fl = np.array(mals2_df['family_label'])
    for idx1, mname1 in enumerate(m1_x):
        for idx2, mname2 in enumerate(m2_x):
            if mname1 == mname2:
                if m1_sl[idx1] != m2_sl[idx2]:
                    print("Sample label incongruence: {:d} {:d}".format(m1_sl[idx1], m2_sl[idx2]))
                    counter += 1
                if m1_fl[idx1] != m2_fl[idx2]:
                    print("Family label incongruence: {:d} {:d}".format(m1_fl[idx1], m2_fl[idx2]))
                    counter += 1
        if (idx1 % 1000) == 0:
            print("Processed {:d} malware names.".format(idx1))
    print("Total Incongruence Errors: {:d}".format(counter))
    return
def __init__(self, path_to_file, batch_size=32, skip_header=False, column_id=0, column_label=1, column_path=2):
    """
    Constructor. Just reads the file and creates the two lists to be used.
    :param path_to_file: Where the file resides
    :param batch_size: how many images to return per minibatch
    :param skip_header: Does the file have a header?
    :param column_id: Column number on where the item ID resides
    :param column_label: Column number on where the label is stored
    :param column_path: Column number to get the relative path
    """
    try:
        if skip_header:
            # the file has a header row: let pandas consume it
            corpus_df = pd.read_csv(path_to_file)
        else:
            # no header row: treat every row as data
            corpus_df = pd.read_csv(path_to_file, header=None)
    except OSError:
        raise TK1CorpusBuilderError("{} not found".format(path_to_file))
    # let's shuffle the corpus
    corpus_df = corpus_df.sample(frac=1).reset_index(drop=True)
    # all right, let's store the lists then
    self.batch_size = batch_size
    self.ids = corpus_df[corpus_df.columns[column_id]].values
    self.labels = corpus_df[corpus_df.columns[column_label]].values
    self.image_path = corpus_df[corpus_df.columns[column_path]].values
    # we're done here, deleting stuff
    del corpus_df
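# A minimal usage sketch of the constructor above. The class name
# (TK1CorpusBuilder, inferred from the error type raised) and the CSV layout
# are assumptions, not confirmed by the source:
# builder = TK1CorpusBuilder('corpus.csv', batch_size=64, skip_header=True)
# builder.ids, builder.labels and builder.image_path are parallel arrays,
# shuffled together row-wise by the sample(frac=1) call above.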
def file2dataframe():
    # renamed from `dir` to avoid shadowing the builtin
    data_dir = "C:\\Users\\wyq\\Desktop\\WikiDataAnalyse\\data\\target_prediction\\"
    links = pd.read_csv(data_dir + 'links.tsv', sep='\t', header=None)
    paths = pd.read_csv(data_dir + 'paths_finished.tsv', sep='\t')
    paths["path"] = paths["path"].apply(lambda x: x.split(';'))
    vectors = normalize()
    return links, paths, vectors
def generate_sample_labels(av_report_file, out_report_file, label_file):
    mals = pd.read_csv(av_report_file)
    labels = pd.read_csv(label_file)
    # Now generate a unique scalar label map. We use WinDefender as the default
    # classification; if WinDefender is OK and ClamAV is not OK, use the ClamAV
    # classification; if both are OK, default to a 0 label value for now.
    type_x = np.array(mals['malware_type_x'])
    type_y = np.array(mals['malware_type_y'])
    scalar_labels = [0] * mals.shape[0]
    counter, scalar_label_map = get_sample_labels(mals, labels)  # Get the malware label dict.
    for idx, y_val in enumerate(type_y):
        if y_val != 'OK':
            mals.iloc[idx, 1] = y_val  # copy the Defender classification over the ClamAV classification
        # Now update the label map with a new scalar label value
        if mals.iloc[idx, 1] not in scalar_label_map.keys():
            counter += 1
            scalar_label_map[mals.iloc[idx, 1]] = counter
        # now get the scalar label for this malware sample
        scalar_labels[idx] = scalar_label_map[mals.iloc[idx, 1]]
        if (idx % 1000) == 0:  # report progress
            print("Processed label: {:d} {:s} -> {:d}.".format(idx, mals.iloc[idx, 1], scalar_labels[idx]))
    mals['sample_label'] = scalar_labels
    mals.to_csv(out_report_file, index=False)
    save_sample_labels(scalar_label_map)
    return
def main():
    res = []
    num_iterations = params['num_iterations']
    early_stopping_round = params['early_stopping_round']
    print(params)
    for i in range(cnt):
        train_fea = pd.read_csv(root_path + 'train_score_{}.csv'.format(i))
        train_lab = pd.read_csv(root_path + 'label_{}.csv'.format(i))
        train_lab = train_lab.loc[:, 'label'].values
        lgb_train = lgb.Dataset(train_fea, train_lab)
        solver = lgb.train(params, lgb_train,
                           valid_sets=[lgb_train],
                           valid_names=['train'],
                           verbose_eval=True,
                           num_boost_round=num_iterations,
                           early_stopping_rounds=early_stopping_round)
        pred_fea = pd.read_csv(root_path + 'res_score.csv')
        pred_fea = pred_fea.drop([i], axis=1).values
        # best_iteration (not best_score, which is a dict of metrics) is the
        # boosting round selected by early stopping
        res.append(solver.predict(pred_fea, num_iteration=solver.best_iteration))
    pd.DataFrame(np.array(res).T).to_csv(root_path + 'res_score2.csv', index=False)
    res = np.mean(res, axis=0)
    pred_pair = pd.read_csv(root_path + 'test1.csv')
    pred_pair['score'] = res
    pred_pair['score'] = pred_pair['score'].apply(lambda x: '{:.6f}'.format(x))
    pred_pair.to_csv(root_path + 'submission-5000-layer2.csv', index=False)
def gera(nome_teste, nome_pred):
    pred = pd.read_csv('dados/' + nome_teste, delimiter=' ', usecols=[0, 1],
                       header=None, names=['alvo', 'preco'])
    out = pd.read_csv('dados/' + nome_pred, delimiter=' ', usecols=[0],
                      header=None, names=['resultado'])
    print len(pred)
    print len(out)
    errosx = []
    errosy = []
    acertosx = []
    acertosy = []
    precosx = []
    precosy = []
    for i in range(0, len(pred)):
        precosx.append(i)
        precosy.append(float(pred['preco'][i][2:]))
        if pred['alvo'][i] == out['resultado'][i]:
            acertosx.append(i)
            acertosy.append(float(pred['preco'][i][2:]))
        else:
            errosx.append(i)
            errosy.append(float(pred['preco'][i][2:]))
    plt.plot(precosx, precosy)
    plt.plot(errosx, errosy, 'rx')
    plt.plot(acertosx, acertosy, 'x')
    plt.show()
def getData(folderList, shapes, trips, stopTimes, calendar, frequencies):
    for folder in folderList:
        print('Adding data from ' + folder + '.')
        # Read the files from the data.
        readShapes = pd.read_csv('../' + folder + '/shapes.txt')[shapeData]
        readTrips = pd.read_csv('../' + folder + '/trips.txt')[routeData]
        readStopTimes = pd.read_csv('../' + folder + '/stop_times.txt')[timeData]
        readCalendar = pd.read_csv('../' + folder + '/calendar.txt')[calendarData]
        # Append it to the existing data.
        shapes = pd.concat([shapes, readShapes])
        trips = pd.concat([trips, readTrips])
        stopTimes = pd.concat([stopTimes, readStopTimes])
        calendar = pd.concat([calendar, readCalendar])
        if os.path.isfile('../' + folder + '/frequencies.txt'):
            readFrequencies = pd.read_csv('../' + folder + '/frequencies.txt')
            frequencies = pd.concat([frequencies, readFrequencies])
        # Calculate the number of missing shapes.
        num_shapes = trips.groupby('route_id').size()
        num_validshapes = trips[trips.shape_id.isin(shapes.shape_id)].groupby('route_id').size()
        num_missingshapes = num_shapes - num_validshapes
        percent_missingshapes = num_missingshapes / num_shapes * 100
        print('Missing data from ' + folder + ':')
        num_missingshapesList = num_missingshapes[num_missingshapes != 0]
        # report the missing shapes only when there are some
        if not num_missingshapesList.empty:
            print(num_missingshapesList)
            print(percent_missingshapes[percent_missingshapes != 0])
        else:
            print('No data missing.\n')
    return shapes, trips, stopTimes, calendar, frequencies
def main(output=RESULTS1B):
    """
    Using 1 nearest neighbor, predicts NYC Taxi trip times based on feature
    vectors (pickup latitude, pickup longitude, dropoff latitude, dropoff
    longitude). Tests on a subset of trip_data_1.csv.
    Uses sklearn to implement nearest neighbors.
    """
    features = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
                'dropoff_longitude', 'trip_time_in_secs']
    ## Extract necessary data into pandas dataframes
    numrows = 100000
    df_train_read = pd.read_csv(TRAIN_DATA)
    df_test_read = pd.read_csv(TRIP_DATA_1, nrows=numrows)  # first 100k rows, for speed
    df_test = df_test_read[features].dropna()
    df_train = df_train_read[features].dropna()
    ## Use sklearn to run nearest neighbors
    k = 1
    clf = KNeighborsClassifier(n_neighbors=k)  # default distance metric: euclidean
    clf.fit(df_train[features[0:4]], df_train[features[-1]])
    preds = clf.predict(df_test[features[0:4]])
    ## Calculate statistics (Root Mean Squared Error, Correlation Coefficient, Mean Absolute Error)
    print "Calculating statistics"
    with open(output, "a+") as outputFile:
        outputFile.write("Ran knn with k={}".format(k) + \
                         " Trained on {}. Tested on first".format(TRAIN_DATA) + \
                         " {} rows of {}. Stats:".format(numrows, TRIP_DATA_1))
    calcAndLogStats(
        numpy.array(preds),
        numpy.array(df_test[features[-1]]),
        output=output)
def fit_montecarlo_tree(path_index, paths=None, index_filter=None, class_filter=None,
                        feature_filter=None, folds=10):
    """Unlike fit_tree, this method receives all the paths. It trains on only
    one of them, indicated by path_index, but then opens every set, in order,
    to classify it.
    """
    data = pd.read_csv(paths[path_index], index_col=0)
    data, y = utils.filter_data(data, index_filter, class_filter, feature_filter)

    skf = cross_validation.StratifiedKFold(y, n_folds=folds)

    results = []
    for train_index, test_index in skf:
        train_X = data.iloc[train_index]
        train_y = y.iloc[train_index]

        clf = None
        clf = tree.Tree('gain', max_depth=10, min_samples_split=20)
        clf.fit(train_X, train_y)

        # Now classify every dataset with this tree
        # (restored from the commented-out predict_table/append calls in the
        # original: predict on each set and collect the tables)
        for path in paths:
            test_data = pd.read_csv(path, index_col=0)
            test_X, test_y = utils.filter_data(test_data, index_filter, class_filter,
                                               feature_filter)
            result = clf.predict_table(test_X, test_y)
            results.append(result)

    return pd.concat(results)
def load_data(dev_mode=True):
    '''Loads data: dev_mode=True splits the train set in train and test'''
    # Load data
    node_info = pd.read_csv(pth('node_information.xls'), header=None)
    node_info.columns = ['id', 'date', 'og_title', 'authors', 'journal', 'og_abstract']
    train = pd.read_csv(pth('training_set.txt'), sep=' ', header=None)
    train.columns = ['id1', 'id2', 'link']
    test = pd.read_csv(pth('testing_set.txt'), sep=' ', header=None)
    test.columns = ['id1', 'id2']
    # Split train into train and test
    if dev_mode:
        prop = 0.75
        idx_perm = np.random.permutation(range(len(train)))
        test = train.iloc[idx_perm[int(len(train) * prop):]]
        train = train.iloc[idx_perm[:int(len(train) * prop)]]
    # pre-process node_info
    if isinstance(node_info.authors.iloc[0], str) or isinstance(node_info.authors.iloc[0], float):
        node_info.authors = node_info.authors.str.split(', ')
        node_info.loc[node_info.authors.isnull(), 'authors'] = \
            node_info[node_info.authors.isnull()].apply(lambda x: [], axis=1)
    return node_info, train, test
def run():
    batch_size = 4000
    global signatures
    signatures = get_pickled_signatures()
    pool = avito_utils.PoolWrapper(processes=4)
    name = 'ssim'

    print 'processing train data...'
    t0 = time()
    df = pd.read_csv('../input/ItemPairs_train.csv')
    delete_file_if_exists('features_%s_train.csv' % name)
    for batch_no, batch in tqdm(list(prepare_batches(df, batch_size))):
        features = process_batch(batch, pool)
        append_to_csv(features, 'features_%s_train.csv' % name)
    print 'processing train data took %0.5fs' % (time() - t0)

    print 'processing test data...'
    t0 = time()
    df = pd.read_csv('../input/ItemPairs_test.csv')
    delete_file_if_exists('features_%s_test.csv' % name)
    for batch_no, batch in tqdm(list(prepare_batches(df, batch_size))):
        features = process_batch(batch, pool)
        append_to_csv(features, 'features_%s_test.csv' % name)
    print 'processing test data took %0.5fs' % (time() - t0)

    pool.close()
def load_dataset(path):
    stores_df = pandas.read_csv('data/store.csv')
    stores_df = stores_df.fillna(-1)
    stores_df['StoreType'] = LabelEncoder().fit_transform(stores_df['StoreType'])
    stores_df['Assortment'] = LabelEncoder().fit_transform(stores_df['Assortment'])
    # Dropping yields a better performance than:
    # - Giving each month a boolean column
    # - Replacing the string with a count of the months
    stores_df = stores_df.drop('PromoInterval', axis=1)
    annotated_df = pandas.read_csv(path, parse_dates=['Date'], dtype={'StateHoliday': object})
    # Dropping yields a better performance than:
    # - Label encoding
    annotated_df = annotated_df.drop('StateHoliday', axis=1)
    # Ugly but fast way to convert the Date column into useful, separate columns
    (
        annotated_df['DayOfWeek'],
        annotated_df['IsWeekend'],
        annotated_df['DayOfMonth'],
        annotated_df['Month'],
        annotated_df['Year']
    ) = zip(*annotated_df['Date'].map(split_date))
    annotated_df = annotated_df.drop('Date', axis=1)
    annotated_df = annotated_df.fillna(-1)
    # Merging dataset and stores
    return pandas.merge(annotated_df, stores_df, on='Store', how='inner', sort=False)
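# split_date is referenced above but not defined in this excerpt. A minimal
# sketch of what it must return, inferred from the zip(*...) unpacking into
# five columns (hypothetical helper, assuming a pandas Timestamp argument):
def split_date(date):
    # dayofweek: 0 = Monday ... 6 = Sunday, so >= 5 marks the weekend
    return (date.dayofweek, int(date.dayofweek >= 5),
            date.day, date.month, date.year)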
def data_collection_stats():
    print(check_output(["ls", "../input"]).decode("utf8"))
    train_images = check_output(["ls", "../input/train_photos"]).decode("utf8")
    print(train_images[:])
    print('time elapsed ' + str((time.time() - config.start_time) / 60))

    print('Reading data...')
    train_photos = pd.read_csv('../input/train_photo_to_biz_ids.csv')
    train_photos.sort_values(['business_id'], inplace=True)
    # set_index returns a new frame, so keep the result; drop=False keeps the
    # business_id column, which is still used below
    train_photos = train_photos.set_index(['business_id'], drop=False)
    test_photos = pd.read_csv('../input/test_photo_to_biz.csv')
    test_photos.sort_values(['business_id'], inplace=True)
    test_photos = test_photos.set_index(['business_id'], drop=False)
    train = pd.read_csv('../input/train.csv')
    train.sort_values(['business_id'], inplace=True)
    train = train.reset_index(drop=True)
    print('Number of training samples: ', train.shape[0])
    print('Number of train samples: ', len(set(train_photos['business_id'])))
    print('Number of test samples: ', len(set(test_photos['business_id'])))
    print('Finished reading data...')
    print('Time elapsed: ' + str((time.time() - config.start_time) / 60))
    print('Reading/Modifying images..')
    return (train_photos, test_photos, train)
def download_stock_list(self, response):
    exchange = response.meta['exchange']
    path = files_contract.get_security_list_path('stock', exchange)
    df = pd.read_csv(io.BytesIO(response.body), dtype=str)
    if df is not None:
        if os.path.exists(path):
            df_current = pd.read_csv(path, dtype=str)
            df_current = df_current.set_index('code', drop=False)
        else:
            df_current = pd.DataFrame()
        df = df.loc[:, ['Symbol', 'Name', 'IPOyear', 'Sector', 'industry']]
        df = df.dropna(subset=['Symbol', 'Name'])
        df.columns = ['code', 'name', 'listDate', 'sector', 'industry']
        df.listDate = df.listDate.apply(lambda x: to_time_str(x))
        df['exchange'] = exchange
        df['type'] = 'stock'
        df['id'] = df[['type', 'exchange', 'code']].apply(lambda x: '_'.join(x.astype(str)), axis=1)
        df['sinaIndustry'] = ''
        df['sinaConcept'] = ''
        df['sinaArea'] = ''
        df = df.set_index('code', drop=False)
        diff = set(df.index.tolist()) - set(df_current.index.tolist())
        diff = [item for item in diff if item != 'nan']
        if diff:
            df_current = df_current.append(df.loc[diff, :], ignore_index=False)
            df_current = df_current.loc[:, STOCK_META_COL]
            df_current.columns = STOCK_META_COL
            df_current.to_csv(path, index=False)
def map_GO_to_GTEX():
    inputFilename = '../data/GO_terms_final_gene_counts.txt'
    GO_list_file = open(inputFilename)
    GO_list = np.loadtxt(GO_list_file, skiprows=2, usecols=[0], dtype='S10', delimiter='\t')

    inputFilename = '../data/Tissue_Name_Mappings.csv'
    tissue_data = pd.read_csv(inputFilename, header=None)
    map_BTO_to_GTEX = defaultdict(list)
    for index, row in tissue_data.iterrows():
        GTEX_tissue = row[0]
        BTO_tissues = row[1:]
        for tissue in BTO_tissues.dropna():
            map_BTO_to_GTEX[tissue].append(GTEX_tissue)

    inputFilename = '../data/BTO_GO.csv'
    BTO_data = pd.read_csv(inputFilename, skiprows=[0])
    map_GO_to_GTEX = defaultdict(list)
    for index, row in BTO_data.iterrows():
        tissue = row[1]
        if tissue in map_BTO_to_GTEX:
            GO_IDs = row[2:]
            for GO_ID in GO_IDs.dropna():
                if GO_ID in GO_list:
                    map_GO_to_GTEX[GO_ID] = list(set(map_GO_to_GTEX[GO_ID] + map_BTO_to_GTEX[tissue]))
    # inputFile.close()
    return map_GO_to_GTEX
def run():
    global mongo, scaler
    mongo = MongoWrapper(avito_utils.avito_db)
    scaler = prepare_scaler()
    batch_size = 8000
    name = 'imagemagick'
    pool = avito_utils.PoolWrapper()

    t0 = time()
    df = pd.read_csv('../input/ItemPairs_train.csv')
    delete_file_if_exists('features_%s_train.csv' % name)
    print 'read train set, start processing...'
    for batch_no, batch in tqdm(list(prepare_batches(df, batch_size))):
        batch = process_batch(batch, pool)
        append_to_csv(batch, 'features_%s_train.csv' % name)
    print 'processing train set took %0.5fs' % (time() - t0)

    t0 = time()
    df = pd.read_csv('../input/ItemPairs_test.csv')
    delete_file_if_exists('features_%s_test.csv' % name)
    print 'read test set, start processing...'
    for batch_no, batch in tqdm(list(prepare_batches(df, batch_size))):
        batch = process_batch(batch, pool)
        append_to_csv(batch, 'features_%s_test.csv' % name)
    print 'processing test set took %0.5fs' % (time() - t0)

    pool.close()
def order_hist(CreateGroupList, num, f):
    order = pd.read_csv('./B/jdata_user_order.csv', parse_dates=['o_date'])
    sku = pd.read_csv('./B/jdata_sku_basic_info.csv')
    order = pd.merge(order, sku, on='sku_id', how='left')
    target_order = order[(order.cate == 101) | (order.cate == 30)].reset_index(drop=True)
    first_day = datetime.datetime.strptime('2016-08-31 00:00:00', '%Y-%m-%d %H:%M:%S')
    target_order['o_day_series'] = (target_order['o_date'] - first_day).apply(lambda x: x.days)
    target_order = target_order.sort_values(by=['user_id', 'o_day_series'],
                                            ascending=False).reset_index(drop=True)
    alld = []
    for CG in CreateGroupList:
        CreateGroup = CG
        t = target_order[target_order.o_day_series < CreateGroup]
        features = []
        for i in range(num):
            t2 = t[['user_id', f]].groupby(['user_id']).shift(-i)
            t2.columns = t2.columns + '_{}'.format(i)
            features.append(t2.columns[0])
            t = pd.concat([t, t2], axis=1)
        x = t.drop_duplicates(subset=['user_id'])
        x = x[['user_id'] + features]
        x['CreateGroup'] = CreateGroup
        alld.append(x)
    df = pd.concat(alld).reset_index(drop=True)
    # print(np.unique(df.CreateGroup))
    return df
def read_input(**kwargs):
    """
    Read CSV-files

    Parameters
    ----------
    **kwargs : key word arguments
        Arguments passed from command line

    Returns
    -------
    nodes_flows : DataFrame
        Containing data for nodes and flows.
    nodes_flows_seq : DataFrame
        Data for sequences.
    """
    nodes_flows = pd.read_csv(kwargs['NODE_DATA'], sep=kwargs['--sep'])
    nodes_flows_seq = pd.read_csv(kwargs['SEQ_DATA'], sep=kwargs['--sep'], header=None)
    nodes_flows_seq.dropna(axis=0, how='all', inplace=True)
    nodes_flows_seq.drop(0, axis=1, inplace=True)
    nodes_flows_seq = nodes_flows_seq.transpose()
    nodes_flows_seq.set_index([0, 1, 2, 3, 4], inplace=True)
    nodes_flows_seq.columns = range(0, len(nodes_flows_seq.columns))
    nodes_flows_seq = nodes_flows_seq.astype(float)
    return nodes_flows, nodes_flows_seq
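# A hedged usage sketch: the kwargs mirror docopt-style command-line
# arguments, so the key names below come from the function body itself;
# the file names are assumptions.
# nodes_flows, nodes_flows_seq = read_input(
#     **{'NODE_DATA': 'nodes_flows.csv',
#        'SEQ_DATA': 'nodes_flows_seq.csv',
#        '--sep': ','})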
def main():
    df = pd.read_csv("../OUTPUT/segmentation_results_k-means.csv", delimiter=",", skipinitialspace=True)
    df_api = pd.read_csv("../OUTPUT/usersInfoAPI.csv", delimiter=",", skipinitialspace=True)

    # aggregate male, female and null
    df_api["sesso"] = df_api["sesso"].replace("F", "f")
    df_api["sesso"] = df_api["sesso"].replace("M", "m")
    df_api["sesso"] = df_api["sesso"].replace("N", "n")
    df_api["sesso"] = df_api["sesso"].fillna('n')

    df_friends = pd.read_csv("../OUTPUT/network_degree_node.csv", delimiter=",", skipinitialspace=True)

    df_merged = pd.merge(df_api, df, left_on="user_id", right_on="user_id", how='right')
    df_merged = pd.merge(df_friends, df_merged, left_on="user_id", right_on="user_id", how='right')
    df_merged["sesso"] = df_merged["sesso"].fillna('n')
    # df_merged["data_reg"] = pd.to_datetime(df_merged['data_reg'])
    # print df_merged["degree_initial_network"].mean()
    # generi = df_merged["sesso"].values.tolist()
    # counter_sex = Counter(generi)
    # sex_dict = dict(counter_sex)
    # print sex_dict
    # # date_time = datetime.datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S")
    # # # print datetime.datetime.fromtimestamp(int(df_merged["data_reg"].mean()))
    # sys.exit()
    # plt.style.use("dark_background")

    k_means_analysis(df_merged)
def read_file(self):
    # get the training data
    X_train_raw = pd.read_csv(self.file_dir + self.X_train_file)
    # we're going to do some sampling to get rid of skew in the data.
    # first we'll get the rows where the relevance falls in a range of values:
    # <2 (group1); >1.9 & <2.4 (group2); >2.6 & <3 (group3); == 3 (group4)
    X_train_g1 = X_train_raw.loc[X_train_raw['relevance'] < 2]
    X_train_g2 = X_train_raw.loc[(X_train_raw['relevance'] > 1.9) & (X_train_raw['relevance'] < 2.4)]
    X_train_g3 = X_train_raw.loc[(X_train_raw['relevance'] > 2.6) & (X_train_raw['relevance'] < 3)]
    X_train_g4 = X_train_raw.loc[X_train_raw['relevance'] == 3]
    # THEN we take samples based on those (so our final train data is proportional between the ranges)
    # final samples (w/out replacement)
    X_train_g2_s = X_train_g2.sample(n=X_train_g1.shape[0], replace=False)
    X_train_g3_s = X_train_g3.sample(n=X_train_g1.shape[0], replace=False)
    X_train_g4_s = X_train_g4.sample(n=X_train_g1.shape[0], replace=False)
    # stack them up: this is our final X_train
    # (DataFrame.append returns a new frame, so keep the result each time)
    X_train_comp = X_train_g1.append(X_train_g2_s)
    X_train_comp = X_train_comp.append(X_train_g3_s)
    X_train_comp = X_train_comp.append(X_train_g4_s)
    self.X_train = X_train_comp.drop(['id', 'product_uid', 'relevance'], axis=1)
    self.y_train = X_train_comp['relevance']
    # get the testing data
    X_test_raw = pd.read_csv(self.file_dir + self.X_test_file)
    self.X_test = X_test_raw.drop(['id', 'product_uid'], axis=1)
    self.fin_df = X_test_raw.drop(['product_uid', 'prod_query_raw_cosine_tfidf',
                                   'prod_query_fixes_cosine_tfidf', 'des_query_raw_cosine_tfidf',
                                   'des_query_fixes_cosine_tfidf', 'kw_matches_overall',
                                   'kw_matches_title', 'kw_matches_des'], axis=1)
def _read_data(self, data):
    if isinstance(data, pd.core.frame.DataFrame):
        tax_dta = data
    elif isinstance(data, str):
        if data.endswith("gz"):
            tax_dta = pd.read_csv(data, compression='gzip')
        else:
            tax_dta = pd.read_csv(data)
    else:
        msg = ('Records.constructor data is neither a string nor '
               'a Pandas DataFrame')
        raise ValueError(msg)
    # remove the aggregated record from 2009 PUF
    tax_dta = tax_dta[tax_dta.recid != 999999]
    self.dim = len(tax_dta)
    # create variables in NAMES list
    for attrname, varname in Records.NAMES:
        setattr(self, attrname, tax_dta[varname].values)
    for name in Records.ZEROED_NAMES:
        setattr(self, name, np.zeros((self.dim,)))
    self._num = np.ones((self.dim,))
    # specify eNNNNN aliases for several pNNNNN and sNNNNN variables
    self.e22250 = self.p22250
    self.e04470 = self.p04470
    self.e23250 = self.p23250
    self.e25470 = self.p25470
    self.e08000 = self.p08000
    self.e60100 = self.p60100
    self.e27860 = self.s27860
    # specify SOIYR
    self.SOIYR = np.repeat(Records.PUF_YEAR, self.dim)
def create_filtered_matod(city):
    # read nodes
    print('Reading nodes')
    fid = '/home/michael/mit/ods_and_roads/%s/%s_nodes_algbformat.txt' % (city, city)
    nodes = pd.read_csv(fid, sep=' ')
    N = nodes.nid.as_matrix()

    print('Reading MatOD')
    fid = '/home/michael/mit/ods_and_roads/%s/%s_interod_0_1.txt' % (city, city)
    matod = pd.read_csv(fid, sep=' ')

    print('Filtering')
    o = matod.o.as_matrix()
    d = matod.d.as_matrix()
    b = [False] * len(o)
    c = 0
    for k in range(len(o)):
        if o[k] in N and d[k] in N:
            b[k] = True
            c += 1
    print('Number of excluded edges %d of %d' % (len(o) - c, len(o)))
    matod = matod[b]

    print('Saving file')
    fid = '/home/michael/mit/instances/tables/%s_table_od.csv' % city
    matod.to_csv(fid, sep=' ', index=False)
    print('Done')
def test_spread_2(self):
    input_df = DplyFrame(pd.read_csv(StringIO("""country,year,key,value
1,Afghanistan,1999,cases,745
2,Afghanistan,1999,population,19987071
3,Afghanistan,2000,cases,2666
4,Afghanistan,2000,population,20595360
5,Brazil,1999,cases,37737
6,Brazil,1999,population,172006362
7,Brazil,2000,cases,80488
8,Brazil,2000,population,174504898
9,China,1999,cases,212258
10,China,1999,population,1272915272
11,China,2000,cases,213766
12,China,2000,population,1280428583""")))
    input_pd = DplyFrame(pd.read_csv(StringIO("""country,year,cases,population
Afghanistan,1999,745,19987071
Afghanistan,2000,2666,20595360
Brazil,1999,37737,172006362
Brazil,2000,80488,174504898
China,1999,212258,1272915272
China,2000,213766,1280428583""")))
    spread_test_df_1 = input_df >> spread(X.key, X.value)
    spread_test_df_2 = spread(input_df, X.key, X.value)
    spread_test_df_3 = input_df >> group_by(X.key) >> spread(X.key, X.value)
    self.assertTrue(input_pd.equals(spread_test_df_1))
    self.assertTrue(input_pd.equals(spread_test_df_2))
    self.assertTrue(input_pd.equals(spread_test_df_3))
def predict():
    converters = dict(DRUNK_DR=convertDD, RAIL=convertRAIL, TWAY_ID=convertTWAYID)
    acc_train_df = pandas.read_csv('accident_train.csv', converters=converters)
    acc_train_df = acc_train_df.fillna(0)
    acc_test_df = pandas.read_csv('accident_test.csv', converters=converters)
    acc_test_df = acc_test_df.fillna(0)
    ids = acc_test_df['ID'].get_values()
    print "CSVs read in"
    columns = list(acc_train_df.columns)
    for c in columns_to_remove:
        print c
        columns.remove(c)
    columns.remove("YEAR")  # test data doesn't have this key for some reason
    labels = acc_train_df['DRUNK_DR'].get_values()
    data_train = acc_train_df[columns]
    acc_test_df = acc_test_df[columns]
    xgtrain = xgboost.DMatrix(data_train, label=labels)
    xgtest = xgboost.DMatrix(acc_test_df)
    watchlist = [(xgtrain, 'train')]
    bst = xgboost.train(params, xgtrain, num_rounds, watchlist)
    preds = modifyPreds(bst.predict(xgtest))
    with open('submission.csv', 'w') as f:
        f.write("ID,DRUNK_DR\n")
        for i, id_ in enumerate(ids):
            f.write("{},{}\n".format(id_, preds[i]))
def test_semi_join_dplyr_2(self):
    # bivariate keys
    j_test_1 = self.c >> semi_join(self.d)
    j_test_2 = self.d >> semi_join(self.c)
    j_pd_1 = DplyFrame(pd.read_csv(StringIO("""x,y,a
1,1,1
1,1,2
2,2,3""")))
    j_pd_2 = DplyFrame(pd.read_csv(StringIO("""x,y,b
1,1,1
2,2,2
2,2,3""")))
    self.assertTrue(j_test_1.equals(j_pd_1))
    self.assertTrue(j_test_2.equals(j_pd_2))
    # include column names
    j_test_1 = self.c >> semi_join(self.d, by=['x', 'y'])
    j_test_2 = self.d >> semi_join(self.c, by=['x', 'y'])
    self.assertTrue(j_test_1.equals(j_pd_1))
    self.assertTrue(j_test_2.equals(j_pd_2))
    # use different column names
    alt_c = self.c.rename(columns={'x': 'x_2'})
    j_test_1 = alt_c >> semi_join(self.d, by=[('x_2', 'x'), 'y'])
    j_test_2 = self.d >> semi_join(alt_c, by=[('x', 'x_2'), 'y'])
    j_pd_1 = DplyFrame(pd.read_csv(StringIO("""x_2,y,a
1,1,1
1,1,2
2,2,3""")))
    self.assertTrue(j_test_1.equals(j_pd_1))
    self.assertTrue(j_test_2.equals(j_pd_2))
def test_anti_join_dplyr_2(self):
    # bivariate keys
    j_test_1 = self.c >> anti_join(self.d)
    j_test_2 = self.d >> anti_join(self.c)
    j_pd_1 = DplyFrame(pd.read_csv(StringIO("""index,x,y,a
3,3,3,4""")).set_index(['index']))
    j_pd_1.index.name = None
    j_pd_2 = DplyFrame(pd.read_csv(StringIO("""index,x,y,b
3,4,4,4""")).set_index(['index']))
    j_pd_2.index.name = None
    self.assertTrue(j_test_1.equals(j_pd_1))
    self.assertTrue(j_test_2.equals(j_pd_2))
    # use column names
    j_test_1 = self.c >> anti_join(self.d, by=['x', 'y'])
    j_test_2 = self.d >> anti_join(self.c, by=['x', 'y'])
    self.assertTrue(j_test_1.equals(j_pd_1))
    self.assertTrue(j_test_2.equals(j_pd_2))
    # use different column names
    alt_c = self.c.rename(columns={'x': 'x_2'})
    j_test_1 = alt_c >> anti_join(self.d, by=[('x_2', 'x'), 'y'])
    j_test_2 = self.d >> anti_join(alt_c, by=[('x', 'x_2'), 'y'])
    j_pd_1 = DplyFrame(pd.read_csv(StringIO("""index,x_2,y,a
3,3,3,4""")).set_index(['index']))
    j_pd_1.index.name = None
    self.assertTrue(j_test_1.equals(j_pd_1))
    self.assertTrue(j_test_2.equals(j_pd_2))
def get_ticks_by_date(self, symbol, begin_date, end_date, hours="regular",
                      parse_dates=False, nrows=None):
    dates = self.parse_dates(begin_date, end_date)
    suffix = self.get_file_suffix(hours)
    filenames = [symbol + s + ".csv.gz" for s in suffix]
    if parse_dates:
        tick_data = pd.DataFrame(columns=["type", "price", "size", "exch", "cond"])
    else:
        tick_data = pd.DataFrame(columns=["datetime", "type", "price", "size", "exch", "cond"])
    for date in dates:
        for filename in filenames:
            data_path = os.path.join(self.base_dir, date, filename)
            if not os.path.exists(data_path):
                continue
                # print "cannot find", data_path
                # raise IOException("Data file not found: %s" % data_path)
            if parse_dates:
                dateparse = lambda x: pd.datetime.strptime(x + "000", '%m/%d/%Y %H:%M:%S.%f')
                cur_ticks = pd.read_csv(data_path, compression="gzip",
                                        names=["datetime", "type", "price", "size", "exch", "cond"],
                                        parse_dates=[0], date_parser=dateparse,
                                        index_col=0, nrows=nrows)
                # cur_ticks = pd.read_csv(data_path, compression="gzip",
                #                         names=["datetime", "type", "price", "size", "exch", "cond"],
                #                         parse_dates=[0], index_col=0, nrows=nrows)
            else:
                cur_ticks = pd.read_csv(data_path, compression="gzip",
                                        names=["datetime", "type", "price", "size", "exch", "cond"],
                                        nrows=nrows)
            tick_data = tick_data.append(cur_ticks)
    return tick_data
def load_annotations(self):
    self.num_annotators = 0
    self.annotations = []
    self.locations = []
    self.targets = None
    targets_file_name = os.path.join(self.path, 'targets.csv')
    if os.path.exists(targets_file_name):
        self.targets = pd.read_csv(targets_file_name)
    while True:
        annotation_filename = "{}/annotations_{}.csv".format(self.path, self.num_annotators)
        location_filename = "{}/location_{}.csv".format(self.path, self.num_annotators)
        if not os.path.exists(annotation_filename):
            break
        self.annotations.append(pd.read_csv(annotation_filename))
        self.locations.append(pd.read_csv(location_filename))
        self.num_annotators += 1
    self.annotations_loaded = self.num_annotators != 0
def main():
    # Get the data and targets
    df = pd.read_csv('train1.csv')
    df = df[df.rating != 'rating']
    corpus = [review for review in df.review]
    splitPoint = len(corpus) * 2 / 3
    trainingCorpus = corpus[:splitPoint]
    testCorpus = corpus[splitPoint:]
    target = [rating for rating in df.rating]
    trainingTarget = np.array(target[:splitPoint])
    testTarget = np.array(target[splitPoint:])
    # Train the algorithm
    train_X, vocabList = createVectorizer(trainingCorpus, 'None', True)
    NB_Bern_model = BernoulliNB().fit(train_X, trainingTarget)
    # Test the algorithm
    test_X = createVectorizer(testCorpus, vocabList, True)
    test_predict = NB_Bern_model.predict(test_X)
    print(np.mean(test_predict == testTarget))
    print metrics.classification_report(testTarget, test_predict, target_names=['0', '1'])
    # Make Predictions
    predict_df = pd.read_csv('test2.csv')
    predictCorpus = [review for review in predict_df.review]
    member = [memberid for memberid in predict_df.ID]
    predict_X = createVectorizer(predictCorpus, vocabList, True)
    predictions = NB_Bern_model.predict(predict_X)
    predict_df.columns = ['ID', 'Predicted']
    for i in range(len(member)):
        predict_df.loc[predict_df['ID'] == member[i], 'Predicted'] = predictions[i]
    predict_df.to_csv('submission1.csv', sep=',', index=False)
import pandas as pd
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score, accuracy_score


def run_naive_bayes(X_train, y_train, X_test, y_test, _alpha=0.5):
    # clf = MultinomialNB(alpha=_alpha)
    clf = GaussianNB()  # note: _alpha is only used by the MultinomialNB variant above
    clf.fit(X_train, y_train)
    predictions_count = clf.predict(X_test)
    print("f1 score: ", f1_score(y_test, predictions_count))
    print("accuracy score: ", accuracy_score(y_test, predictions_count))


if __name__ == "__main__":
    # read data
    df_train = pd.read_csv("../data/train_opt.csv", sep=',')
    df_train['Comment'] = df_train['Comment'].fillna(' ')
    df_test = pd.read_csv("../data/test_opt.csv", sep=',')
    df_test['Comment'] = df_test['Comment'].fillna(' ')
    # labels
    y_train = df_train['Insult']
    y_test = df_test['Insult']
    count_vectorizer = CountVectorizer(min_df=3)
    X_train = count_vectorizer.fit_transform(df_train['Comment'])
    X_test = count_vectorizer.transform(df_test['Comment'])
    run_naive_bayes(X_train.toarray(), y_train, X_test.toarray(), y_test, 1)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNetCV
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error

print("AIRFOIL SELF-NOISE")
names = ['Frequency', 'Angle-Attack', 'Chord-Length', 'Free-stream-velocity',
         'Suction-thickness', 'SSPresure-level']
data = pd.read_csv('./datos/airfoil_self_noise.dat', names=names, sep="\t")

print("PREPROCESADO")
print("Matriz de correlación")
# correlation matrix
corr_matrix = data.corr()
k = 6  # number of variables in the heatmap
cols = corr_matrix.nlargest(k, 'Frequency')['Frequency'].index
cm = np.corrcoef(data[cols].values.T)
plt.subplots(figsize=(9, 9))
sns.set(font_scale=0.75)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f',
                 annot_kws={'size': 10}, yticklabels=cols.values,
                 xticklabels=cols.values)
plt.show()
import re
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

train = pd.read_csv('conversation/data/train.csv')
x_train = train.iloc[:, 0].values
y_train = train.iloc[:, 1:2].values

all_stopwords = stopwords.words('english')
all_stopwords.remove('not')

stemmer = PorterStemmer()
corpus = []
for i in x_train:
    text = re.sub('[^a-zA-Z]', ' ', i)
    text = text.lower()
    text = text.split()
    text = [stemmer.stem(word) for word in text if word not in set(all_stopwords)]
    text = ' '.join(text)
    corpus.append(text)

cv = CountVectorizer()
x_train = cv.fit_transform(corpus)
pickle.dump(cv, open('conversation/save/count_vectorizer.pickle', 'wb'))
import optuna
from sklearn.metrics import log_loss
import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

train = pd.read_csv('../titanic/train.csv')
test = pd.read_csv('../titanic/test.csv')
sub = pd.read_csv('../titanic/gender_submission.csv')

data = pd.concat([train, test], sort=False)
data['Sex'].replace(['male', 'female'], [0, 1], inplace=True)
data['Embarked'].fillna('S', inplace=True)
data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
data['Fare'].fillna(np.mean(data['Fare']), inplace=True)
age_avg = data['Age'].mean()
age_std = data['Age'].std()
data['Age'].fillna(np.random.randint(age_avg - age_std, age_avg + age_std), inplace=True)
delete_columns = ['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin']
data.drop(delete_columns, axis=1, inplace=True)
print(data.head())

train = data[:len(train)]
test = data[len(train):]
X_train = train.drop('Survived', axis=1)
X_test = test.drop('Survived', axis=1)
# PROCESSING
import time
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')

# LOAD THE DATA
os.getcwd()
os.chdir("C://Users//Sony//Desktop//TESIS 2")
df = pd.read_csv('CIC_AWS_Filtrado.csv')
df.head(10)

# PREPARE THE DATA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
import sklearn.metrics
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import classification_report
from sklearn import metrics

df1 = df[['Dst_Port', 'Protocol', 'Flow_Duration', 'Tot_Fwd_Pkts', 'Tot_Bwd_Pkts',
          'TotLen_Fwd_Pkts', 'TotLen_Bwd_Pkts', 'Fwd_Pkt_Len_Mean', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min',
import numpy as np
import pandas as pd


def checksum(m):
    # sum over rows of (row max - row min)
    rows = len(m)
    cols = len(m[0])
    diff_sum = 0
    for i in range(rows):
        row_min = 1000000  # renamed from min/max to avoid shadowing builtins
        row_max = 0
        for j in range(cols):
            if m[i, j] <= row_min:
                row_min = m[i, j]
            if m[i, j] >= row_max:
                row_max = m[i, j]
        diff_sum += abs(row_max - row_min)
    return diff_sum


csvfile = pd.read_csv("day2_input.csv", sep="\t", header=None)
m = np.array(csvfile)
print("The matrix is: ")
print(m)
answer = checksum(m)
print("The checksum is: " + str(answer))
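# A quick standalone sanity check of checksum on a small rectangular matrix
# (my own example, not from the puzzle input): per-row max minus min is
# 9-1=8, 8-3=5 and 6-2=4, so the checksum is 17.
example = np.array([[1, 9, 5],
                    [3, 8, 6],
                    [2, 4, 6]])
assert checksum(example) == 17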
"""Simulation file used to run the model""" import time from spillover_model_calRA import * from spillover_model import * from calibration_functions import * import pandas as pd from stochasticprocess import * import matplotlib.pyplot as plt from scipy.optimize import minimize import math df_inflation = pd.read_csv('C:\Users\jrr\Dropbox\International Spillovers\Data\inflation\CPI_96.csv') df_interest= pd.read_excel('C:\Users\jrr\Dropbox\International Spillovers\Data\interest_rates\deposit_rates.xls',sheet_name='data') df_penn= pd.read_excel('C:\Users\jrr\Dropbox\International Spillovers\Data\inflation\penworldtable90.xlsx', Sheet_name="Sheet5") df_penn = df_penn.drop(df_penn[df_penn.year != 2014].index) ROW_countries = ['Argentina', 'Australia', 'Bermuda', 'Botswana', 'Brazil', 'Canada', 'Chile', 'China', 'Colombia', 'Czech Republic', 'Denmark', 'HongKong', 'Hungary', 'India', 'Indonesia', 'Israel', 'Japan', 'Kuwait', 'Lebanon', 'Liechtenstein', 'Malaysia', 'Mexico', 'Monaco', 'Namibia', 'New Zealand', 'Norway', 'Oman', 'Pakistan', 'Peru', 'Philippines', 'Puerto Rico', 'Poland', 'Russia', 'Singapore', 'South Africa', 'Korea', 'Sweden', 'Switzerland', 'Taiwan', 'Thailand', 'Turkey', 'United Kingdom', 'United States', 'Venezuela', 'Vietnam'] inflation = {df_inflation.iloc[i][0]:df_inflation.iloc[i][20] for i in range(len(df_inflation))} interest = {df_interest.iloc[i][0]:df_interest.iloc[i][3] for i in range(len(df_interest))} rgdp = {df_penn.iloc[i][1]: df_penn.iloc[i][5] for i in range(len(df_penn))} inf = {} int = {} gdp = {} for i in ROW_countries: try: if math.isnan(float(inflation[i])) != True:
def simulatedata(self):
    nr.seed(seed=79819)
    plt.close('all')
    QUIC20 = pd.read_csv(fname)
    # startTime = pd.Timestamp(dt.datetime(2014, 7, 5, 12, 0, 20))
    # endTime = pd.Timestamp(dt.datetime(2014, 7, 5, 18, 0, 0))
    # November has 30 days, and leading-zero integer literals are invalid in
    # Python 3, so the bounds are written as plain integers
    startTime = pd.Timestamp(dt.datetime(2014, 11, 1, 0, 0, 0))
    endTime = pd.Timestamp(dt.datetime(2014, 11, 30, 0, 0, 0))
    TimeStamp = pd.date_range(startTime, endTime, freq='20s')
    QUIC20['dateTime'] = pd.date_range(startTime, endTime, freq='20s')
    QUIC20.BaseSim = QUIC20.BaseSim * 10**7
    QUIC20.RemoteSim = QUIC20.RemoteSim * 10**7
    QUIC20.index = QUIC20.dateTime
    QUIC = QUIC20.resample('1s', fill_method='pad')
    QUIC['dateTime'] = QUIC.index.copy()
    rawdat = importSPOD(datafolder, 1, startTime, endTime)
    QUIC['U'] = rawdat['U']
    QUIC['V'] = rawdat['V']
    QUIC['WS'] = rawdat['WS']
    QUIC['Time'] = pd.to_datetime(
        QUIC.index.copy()).astype('int').astype(float) / (10**18)

    # Simulate Data
    Num = len(QUIC)
    QUIC['BaseBM'] = genBrownianBridge(Num) + 1.5
    NewBase = QUIC.BaseSim + QUIC.BaseBM
    NewBase[NewBase > 5] = 5
    NewBase[NewBase < 0.25] = nr.randn(len(NewBase[NewBase < 0.25])) * .05 + 0.25
    QUIC['Base'] = NewBase
    QUIC['RemoteBM'] = genBrownianBridge(Num)
    NewRemote = QUIC.RemoteSim + QUIC.RemoteBM
    NewRemote[NewRemote > 5] = 5
    NewRemote[NewRemote < 0.25] = nr.randn(len(NewRemote[NewRemote < 0.25])) * .05 + 0.25
    QUIC['Remote'] = NewRemote

    # Plot Simulated Data
    font = {'weight': 'bold', 'size': 8}
    mpl.rc('font', **font)
    baseTotal = ggplot(aes(x='dateTime', y='Base'), data=QUIC) +\
        geom_line() +\
        ylim(0, 5) +\
        geom_line() + xlab("") + ylab("Simulated Signal (V)")
    baseRand = ggplot(aes(x='dateTime', y='BaseBM'), data=QUIC) +\
        geom_line() + xlab("") + ylab("Stochastic Baseline")
    baseSim = ggplot(aes(x='dateTime', y='BaseSim'), data=QUIC) +\
        geom_line() +\
        ylim(0, 5) + xlab("") + ylab("Simulated Signal (V)")
    remoteTotal = ggplot(aes(x='dateTime', y='Remote'), data=QUIC) +\
        geom_line() + xlab("") + ylab("Simulated Signal (V)")
    # theme_matplotlib(mpl.rc('font', **font), matplotlib_defaults=False)
    ggsave(plot=baseTotal, filename=figfolder + 'BaseTotal.png', width=8, height=2)
    ggsave(plot=baseRand, filename=figfolder + 'BaseRand.png', width=8, height=2)
    ggsave(plot=baseSim, filename=figfolder + 'BaseSim.png', width=8, height=2)
    ggsave(plot=remoteTotal, filename=figfolder + 'remoteTotal.png', width=8, height=2)

    # Illustrate Method
    fitMinSpline(QUIC['Base'][QUIC.index.min():QUIC.index.min() + pd.Timedelta(freqT, 'h')],
                 QUIC['Time'][QUIC.index.min():QUIC.index.min() + pd.Timedelta(freqT, 'h')],
                 smoothingWindow, plot=True, plotVar=QUIC.dateTime)
    ggsave(filename=figfolder + 'Spline_fit.png', width=8, height=2)
    QUICFilt = applyFilters(
        QUIC[QUIC.index.min():QUIC.index.min() + pd.Timedelta(freqT, 'h')],
        thresh1, thresh2, smoothingWindow)
    butterplot = ggplot(aes(x='dateTime', y='butterBase'), data=QUICFilt) + geom_line() +\
        ylim(0, 5) +\
        xlab('') + ylab('Sensor after Butterworth')
    ggsave(plot=butterplot, filename=figfolder + 'Butterworth_filt.png', width=8, height=2)

    # Apply algorithm
    QUIC['TrueBase'] = QUIC.BaseSim.apply(isSignal, args=(0.01, ))
    QUIC['TrueRemote'] = QUIC.RemoteSim.apply(isSignal, args=(0.01, ))
    FiltAvg = piecewiseImportSpod(startTime, endTime, freq, avgTime,
                                  thresh1, thresh2, smoothingWindow, QUIC, True)
    remoteTotal = ggplot(aes(x='dateTime', y='Remote'), data=QUIC) +\
        geom_line() +\
        theme_matplotlib(mpl.rc('font', **font), matplotlib_defaults=False)
    TrueVDetect = ggplot(aes(x='TrueBase', y='butterBaseSignal'), data=FiltAvg) +\
        geom_point(color='blue') +\
        geom_point(aes(x='TrueRemote', y='butterRemoteSignal'), color='blue') +\
        geom_point(aes(y='splineBaseSignal'), color='green') +\
        geom_point(aes(x='TrueRemote', y='splineRemoteSignal'), color='green') +\
        geom_abline(aes(intercept=0, slope=1)) +\
        ylab('Detected Signal 5 min mean') +\
        xlab('True Signal 5 min mean')
    ggsave(plot=TrueVDetect, filename=figfolder + 'TrueVDetect.png', width=4.5, height=4)
    ButterCorrect = (len(FiltAvg[(FiltAvg.butterBaseSignal > 0.017) & (FiltAvg.TrueBase > 0.017)]) +
                     len(FiltAvg[(FiltAvg.butterRemoteSignal > 0.017) & (FiltAvg.TrueRemote > 0.017)])) / (
                         2.0 * len(FiltAvg))
    print("Butter percent correct: " + str(ButterCorrect))
    SplineCorrect = (len(FiltAvg[(FiltAvg.splineBaseSignal > 0.017) & (FiltAvg.TrueBase > 0.017)]) +
                     len(FiltAvg[(FiltAvg.splineRemoteSignal > 0.017) & (FiltAvg.TrueRemote > 0.017)])) / (
                         2.0 * len(FiltAvg))
    print("Spline percent correct: " + str(SplineCorrect))
    ButterFalsePos = (len(FiltAvg[(FiltAvg.butterBaseSignal > 0.017) & (FiltAvg.TrueBase < 0.017)]) +
                      len(FiltAvg[(FiltAvg.butterRemoteSignal > 0.017) & (FiltAvg.TrueRemote < 0.017)])) / (
                          2.0 * len(FiltAvg))
    print("Butter percent false pos: " + str(ButterFalsePos))
    SplineFalsePos = (len(FiltAvg[(FiltAvg.splineBaseSignal > 0.017) & (FiltAvg.TrueBase < 0.017)]) +
                      len(FiltAvg[(FiltAvg.splineRemoteSignal > 0.017) & (FiltAvg.TrueRemote < 0.017)])) / (
                          2.0 * len(FiltAvg))
    print("Spline percent false pos: " + str(SplineFalsePos))
def get_precip_data():
    return pd.read_csv('precipitation.csv', parse_dates=[2])
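# parse_dates=[2] asks pandas to parse the third (0-indexed) column as
# datetime64; a minimal check, assuming precipitation.csv is present:
# data = get_precip_data()
# data.dtypes  # the parsed column should show dtype datetime64[ns]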
log_path = Path(f"../log/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
log_path.mkdir(parents=True, exist_ok=True)
mid_path = Path(f"../mid/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
mid_path.mkdir(parents=True, exist_ok=True)

####################################################################################################
# Data Loading
print("start data loading")
# train = unpickle("../processed/v003/v003_098/train_compact_v003_098.pkl")
# test = unpickle("../processed/v003/v003_098/test_compact_v003_098.pkl")
train = unpickle("../processed/v003/v003_104/train_compact_v003_104_compact.pkl")
test = unpickle("../processed/v003/v003_104/test_compact_v003_104_compact.pkl")
train_ = pd.read_csv("../input/train.csv")
train_id = train_.id
mol_name = train_.molecule_name
scalar_coupling_constant = train_.scalar_coupling_constant
scalar_coupling_contributions = pd.read_csv('../input/scalar_coupling_contributions.csv')
fc = scalar_coupling_contributions.fc
del train_
del scalar_coupling_contributions

# feat_train = unpickle("../processed/v003/atom_3J_substituents1_train_na.pkl")
# feat_test = unpickle("../processed/v003/atom_3J_substituents1_test_na.pkl")
# train = pd.concat([train, feat_train], axis=1)
# test = pd.concat([test, feat_test], axis=1)
# assert len(train) == 4658147
# assert len(test) == 2505542
from datetime import datetime
import os

import pandas as pd
import tensorflow
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Conv1D, MaxPooling1D
from tensorflow.keras.layers import Flatten, Dropout, Activation
from tensorflow.keras.models import model_from_json

dfData = pd.read_csv('drive/My Drive/VoiceData/all_Data.csv')
dfData.head()

X = dfData.loc[:, dfData.columns != 'label']
y = dfData['label']

lb = preprocessing.LabelEncoder()
y = lb.fit_transform(y)

scaler = RobustScaler()
scaler.fit(X)
x = scaler.transform(X)
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde


def get_lng(x):
    # escape the dot so the pattern matches a literal decimal point
    lng = re.findall(r"\d+\.\d+", x)[0]
    return lng


def get_lat(x):
    lat = re.findall(r"\d+\.\d+", x)[1]
    return lat


if __name__ == "__main__":
    df = pd.read_csv("failure_report.csv")
    # the "经纬度" column holds longitude/latitude pairs
    lng_ = df["经纬度"].apply(lambda x: get_lng(x))
    lat_ = df["经纬度"].apply(lambda x: get_lat(x))
    x = lng_.astype(np.float64).values
    y = lat_.astype(np.float64).values
    '''
    longitude range: 120.65961082469951 120.86540172553127
    latitude range: 31.24660722778891 31.42576791825509
    # scatter plot of the failure locations
    plt.scatter(x, y)
    plt.xlim(120.65961082469951, 120.86540172553127)
    plt.ylim(31.24660722778891, 31.42576791825509)
    plt.show()
    '''
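# A small illustration of the fixed pattern (my own example string, assuming
# the column holds "lng,lat" pairs as decimal text):
# re.findall(r"\d+\.\d+", "120.7196,31.3124") -> ['120.7196', '31.3124']
# get_lng("120.7196,31.3124") -> '120.7196'
# get_lat("120.7196,31.3124") -> '31.3124'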
# The imports below are an assumption: they are the usual sources of the
# names used in this snippet (nilearn for the dataset/masker, kmapper for
# the Mapper pieces, sklearn for TSNE/DBSCAN).
import pandas as pd
from nilearn.datasets import fetch_haxby
from nilearn.input_data import NiftiMasker
from kmapper import KeplerMapper, Cover
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE

from dyneusr import DyNeuGraph
from dyneusr.tools import visualize_mapper_stages

# Fetch dataset, extract time-series from ventral temporal (VT) mask
dataset = fetch_haxby()
masker = NiftiMasker(
    dataset.mask_vt[0],
    standardize=True, detrend=True, smoothing_fwhm=4.0,
    low_pass=0.09, high_pass=0.008, t_r=2.5,
    memory="nilearn_cache")
X = masker.fit_transform(dataset.func[0])

# Encode labels as integers
df = pd.read_csv(dataset.session_target[0], sep=" ")
target, labels = pd.factorize(df.labels.values)
y = pd.DataFrame({l: (target == i).astype(int) for i, l in enumerate(labels)})

# Generate shape graph using KeplerMapper
mapper = KeplerMapper(verbose=1)
lens = mapper.fit_transform(X, projection=TSNE(2, random_state=1))
graph = mapper.map(lens, X=X, cover=Cover(20, 0.5), clusterer=DBSCAN(eps=20.))

# Visualize the shape graph using DyNeuSR's DyNeuGraph
dG = DyNeuGraph(G=graph, y=y)
dG.visualize('dyneusr4D_haxby_decoding.html', template='4D', static=True, show=True)
    # (tail of an earlier function, likely pivot_months_loops; its def line
    # is above this excerpt)
    counts.columns.name = 'month'
    return totals, counts


def main():
    data = get_precip_data()
    totals, counts = pivot_months_loops(data)
    totals.to_csv('totals.csv')
    counts.to_csv('counts.csv')
    np.savez('monthdata.npz', totals=totals.values, counts=counts.values)


if __name__ == '__main__':
    main()

totals = pd.read_csv('totals.csv').set_index(keys=['name'])
counts = pd.read_csv('counts.csv').set_index(keys=['name'])

# Recreating totals
data = get_precip_data()
totals2, counts2 = pivot_months_pandas(data)
print(totals2)
print(totals)
print(counts2)
print(counts)
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from EmotionClassifer import EmotionClassifier

root_dir = "/users/imishra/workspace/EmotionDetection"

# Read the data
data_raw = pd.read_csv(root_dir + '/data/isear.csv', error_bad_lines=False, sep="|")
data = pd.DataFrame({'content': data_raw['SIT'], 'sentiment': data_raw['Field1']})

# Clean and transform the data
max_num_words = 4000
max_text_length = 1000
embed_dim = 128
lstm_units = 128
emotionClassifier = EmotionClassifier()
data['content'] = data['content'].apply(EmotionClassifier.clean_text)
data['sentiment_label'] = [emotionClassifier.emotions_labels_map[sentiment]
                           for sentiment in data['sentiment']]

# Create and train the model
emotionClassifier.create_tokenizer(data['content'], max_num_words, max_text_length)
feature_vectors = emotionClassifier.map_features(data['content'])
labels = np.array(data['sentiment_label']).reshape(-1, 1)
emotionClassifier.fit_label_encoder(labels)
labels = emotionClassifier.encode_labels(labels)
emotionClassifier.create_model(embed_dim, lstm_units)
emotionClassifier.compile_model(loss_function='categorical_crossentropy',
                                optimizer='rmsprop', metrics='accuracy')
X_train, X_valid, Y_train, Y_valid = train_test_split(feature_vectors, labels,
                                                      test_size=0.2, random_state=42)
emotionClassifier.train(X_train, Y_train, X_valid, Y_valid, batch_size=128,
                        epochs=30, verbose=2)
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer

data = []
outputs_dir = os.path.join(os.getcwd(), "outputs")
for file in os.listdir(outputs_dir):
    df = pd.read_csv(os.path.join(outputs_dir, file))
    df = df.dropna()
    df = df.drop(["Frame Number", "time_relative"], axis=1)
    df['IMSI'] = df['IMSI'].astype(str)
    df['enb_ue_s1ap_id'] = df['enb_ue_s1ap_id'].astype(str)
    df['mme_ue_s1ap_id'] = df['mme_ue_s1ap_id'].astype(str)
    # one-hot encode the categorical columns via get_dummies
    df = pd.concat(
        [df.drop('protocols', 1), df['protocols'].str.get_dummies(sep="|")], 1)
    df = pd.concat(
        [df.drop('cellidentity', 1), df['cellidentity'].str.get_dummies()], 1)
    df = pd.concat(
        [df.drop('enb_ue_s1ap_id', 1), df['enb_ue_s1ap_id'].str.get_dummies()], 1)
    df = pd.concat(
        [df.drop('mme_ue_s1ap_id', 1), df['mme_ue_s1ap_id'].str.get_dummies()],
#####################################################
# Initial Set Up
# for dash and plotting capabilities
import dash
import dash_core_components as dcc  # for accessing interactive data visualization with plotly.js
import dash_html_components as html  # for accessing html elements h1 h2
import plotly.graph_objs as go  # for designing Chloropleth map
# for reading in data
import pandas as pd
import json

# read in csv file for data analysis
df = pd.read_csv('../data_set/M_Landings_cleaned.csv')
print(df.head(10))

# Read in geojson data
with open('../data_set/coordinates.json', 'r') as json_data:
    df_coordinates = json.load(json_data)
print(type(df_coordinates))
# print(df_coordinates['features'][:])

# mapbox token for mapping choropleth map
mapbox_accesstoken = 'pk.eyJ1IjoiY3JhaWdtYXJpYW5pIiwiYSI6ImNrNTMyM2l4MDA0NHMzbHF2NTI0aHdoMzQifQ.l4cSBnBuWaV49cs1XF4MoA'

##################################################################
# Create plotly figure
# for names in our bar chart
meteors = df['name'].str.title().tolist()
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import skl2onnx
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType, StringTensorType
from skl2onnx.common.data_types import Int64TensorType

titanic_url = ('https://raw.githubusercontent.com/amueller/'
               'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')
data = pd.read_csv(titanic_url)
X = data.drop('survived', axis=1)
y = data['survived']
print(data.dtypes)

# SimpleImputer on string is not available for
# string in ONNX-ML specifications.
# So we do it beforehand.
for cat in ['embarked', 'sex', 'pclass']:
    X[cat].fillna('missing', inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
import sys
import gc

import numpy as np
import pandas as pd


def mag(df):
    return np.linalg.norm(df[['x', 'y', 'z']])


def is_active(res):
    return res['mag_diff']['var'] > 1e-07


f = sys.argv[1]
df_o = pd.read_csv(f)
df_o.set_index(pd.to_datetime(df_o['timestamp']), inplace=True)
df_o.sort_values(by='timestamp', inplace=True)
dfg = df_o.groupby(pd.TimeGrouper('D'))
not_wearing_times = []
for df in dfg:
    df = df[1]
    df = df.rolling('480s').mean()
    df.dropna(inplace=True)
    df.columns = ['x', 'y', 'z']
    df = df.resample('480s').mean()
    try:
        df['mag'] = df.apply(mag, axis=1)
        df['mag_diff'] = df['mag'].diff()
        df.dropna(inplace=True)
        res = df.groupby(pd.Grouper(freq='24Min')).agg(['var'])
        res['wearing'] = res.apply(is_active, axis=1)
from math import radians, sin, cos, asin, sqrt

import pandas as pd


def cal_dist(lon1, lat1, lon2, lat2):
    # haversine great-circle distance in kilometres
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * asin(sqrt(a))
    distance = 6378.137 * c  # equatorial Earth radius in km
    return distance


food = '/Users/molly/Documents/NUS/2ndSemester/Projects/CS5224/Cents_trip/dataset/food.csv'
airbnb = '/Users/molly/Documents/NUS/2ndSemester/Projects/CS5224/Cents_trip/dataset/airbnb.csv'
food_df = pd.read_csv(food)
airbnb_df = pd.read_csv(airbnb)
food_data = food_df.iloc[:, [0, 6, 7]]
airbnb_data = airbnb_df.iloc[:, [0, 2, 3]]
foodid = food_data['FOODID'].as_matrix()
# print(type(foodid[0]))
lat_food = food_data['LATITUDE'].as_matrix()
lng_food = food_data['LONGITUDE'].as_matrix()
roomid = airbnb_data['ROOMID'].as_matrix()
# print(type(roomid[0]))
lat_airbnb = airbnb_data['LATITUDE'].as_matrix()
lng_airbnb = airbnb_data['LONGITUDE'].as_matrix()
distances = []
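# A rough sanity check of cal_dist (my own example values, not from the
# dataset): two points in Singapore that differ by 0.05 deg longitude and
# 0.01 deg latitude come out roughly 5.7 km apart.
# print(cal_dist(103.80, 1.30, 103.85, 1.31))  # ~5.67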
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import pysal as ps

# world = gpd.read_file(gpd.datasets.get_path())
us_income = pd.read_csv(ps.examples.get_path('usjoin.csv'))
print(us_income)
us_income_shape = gpd.read_file(ps.examples.get_path('us48.shx'))
# us_income_shape.plot()
        print('Exception: ', ex)
        print('Total API Calls: ', count)
        break

# Get 2000 township census data
# st_cnty_fips_00 = pd.read_csv('/Users/salma/Studies/Research/Criminal_Justice/research_projects/main_census_merge/data/wip_merge_files/st_cnty_fips_2000.csv')
# get_census_data_from_api('https://api.census.gov/data/2000/sf1', st_cnty_fips_00, 'new_census_townships_00_initial')  # 3141 calls

# Get 2010 township census data
# st_cnty_fips_10 = pd.read_csv('/Users/salma/Studies/Research/Criminal_Justice/research_projects/main_census_merge/data/wip_merge_files/st_cnty_fips_10_temp.csv')
# get_census_data_from_api('https://api.census.gov/data/2010/dec/sf1', st_cnty_fips_10, 'new_census_townships_10_initial_16th', 2010)

# Get 1990 township census data
fips_90 = pd.read_csv(
    '/Users/salma/Studies/Research/Criminal_Justice/research_projects/US_Crime_Analytics/data/wip_merge_files/st_cnty_fips_1990.csv'
)
# st_cnty_fips_90 = fips_90[fips_90['county'].isnull]
# get_census_data_from_api('https://api.census.gov/data/1990/sf1', fips_90, 'new_census_for_tships_90_total_pop', 1990)

"""
16 files for 2010 census due to the limitations on # of API calls per hour.
Hence we need to iterate over the files in the township_10 folder and
concatenate all of them onto the 1st file.
"""


def create_final_twnshp_file(twnshp_dir, first_file):
    # Read the initial file
    twnshp_1st_file_df = pd.read_csv(first_file)
    # Change to the twnshp cen dir
    os.chdir(twnshp_dir)
import pandas as pd
import matplotlib.pyplot as plt

dataset = pd.read_csv('/Users/sledro/Desktop/LondonCrime/Datasets with access/London-street.csv')

# Drop columns indexed 0,2,3,7,8,11 as they carry no useful data
dataset.drop(dataset.columns[[0, 2, 3, 7, 8, 11]], axis=1, inplace=True)

# Drop NaN's
NaNsRemovedAndColsDropped = dataset.dropna(axis=0, how='any')

# Print first 5 rows
### print(NaNsRemovedAndColsDropped.head())

# Add data frame to json file to allow Firebase upload
# NaNsRemovedAndColsDropped.to_csv('/Users/sledro/Desktop/LondonCrime/Datasets with access/Cleaned.csv')
# https://github.com/firebase/firebase-import
# firebase-import --database_url https://londoncrimepredictor.firebaseio.com/ --path / --json Cleaned.json
# res = pd.read_json('/Users/sledro/Desktop/LondonCrime/Datasets with access/Cleaned.json', orient='records')
# print(res.head())

colors = ['#105B63', '#FFFAD5', '#FFD34E', '#DB9E36', '#BD4932']
plot1 = NaNsRemovedAndColsDropped.groupby('Month').size().reset_index(name='number of outcomes').set_index('Month')
plot1
plot1.plot(kind="line", figsize=(20, 10), linestyle='--', marker='o', color=colors)
plt.show()
import os
import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

configs_folder = r"/datadrive/configs"
all_exps = os.listdir(configs_folder)
for exp in tqdm(all_exps):
    if os.path.exists(os.path.join(configs_folder, exp, 'training')):
        # and not os.path.exists(os.path.join(configs_folder, exp, 'progress_graph.png')):
        try:
            df = pd.read_csv(os.path.join(configs_folder, exp, 'training'))
            if any(np.isinf(df).all()):
                print(f'found column all infs in {exp}')
            elif not all(np.isfinite(df).all()):
                print(f'found column some infs in {exp}')
            fig, axs = plt.subplots(2, 1, sharex=True, figsize=(15, 10))
            x = list(range(len(df)))
            axs[0].plot(x, df['loss'], label='Training loss')
            axs[0].plot(x, df['val_loss'], label='Validation loss')
            axs[0].legend(prop={'size': 14})
            axs[0].tick_params(axis="x", labelsize=12)
            axs[0].tick_params(axis="y", labelsize=12)
            additional_metric = ''
            if 'dice_coefficient' in df:
                additional_metric = 'dice_coefficient'
            elif 'vod_coefficient' in df:
                additional_metric = 'vod_coefficient'
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.layers.normalization import BatchNormalization as bn
from sklearn.preprocessing import MinMaxScaler

try:
    df = pd.read_csv(
        "/home/rishabh/Desktop/DeepLearning/keras/Nucleus/dataset_sin.csv")
    temp = df.as_matrix()
    temp = temp.astype(float)
    scaler = MinMaxScaler(feature_range=(0, 1))
    temp = scaler.fit_transform(temp)
    df = pd.DataFrame(temp)

    def _load_data(data, n_prev=100):
        docX = []
        docY = []
        for i in range(len(data) - n_prev):
            docX.append(data.iloc[i:i + n_prev].as_matrix())
            docY.append(data.iloc[i + n_prev].as_matrix())
        alsX = np.array(docX)
        alsY = np.array(docY)
        return alsX, alsY

    n = 10
import codecs
import math

import pandas as pd
import tensorflow as tf
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier

# hidden layer
rnn_unit = 128
# feature
input_size = 40
output_size = 1
lr = 0.0006
k = 4
# csv_file = 'stock3005.csv'
csv_file = 'fof基金20170731-1031.csv'
f = open(csv_file, 'r', encoding=u'utf-8', errors='ignore')
df = pd.read_csv(f)
df.dropna(inplace=True)


def addLayer(inputData, inSize, outSize, activity_function=None):
    Weights = tf.Variable(tf.random_normal([inSize, outSize]))
    basis = tf.Variable(tf.zeros([1, outSize]) + 0.1)
    weights_plus_b = tf.matmul(inputData, Weights) + basis
    if activity_function is None:
        ans = weights_plus_b
    else:
        ans = activity_function(weights_plus_b)
    return ans


x_data = preprocessing.minmax_scale(df.iloc[:, 3:43].values, feature_range=(-1, 1))
y_data = preprocessing.minmax_scale(df.iloc[:, 43:44].values, feature_range=(-1, 1))
def nsw74psid_a(path):
    """A Subset of the nsw74psid1 Data Set

    The `nsw74psidA` data frame has 252 rows and 10 columns. See
    `nsw74psid1` for more information.

    This data frame contains the following columns:

    trt
        a numeric vector
    age
        a numeric vector
    educ
        a numeric vector
    black
        a numeric vector
    hisp
        a numeric vector
    marr
        a numeric vector
    nodeg
        a numeric vector
    re74
        a numeric vector
    re75
        a numeric vector
    re78
        a numeric vector

    Args:
      path: str.
        Path to directory which either stores file or otherwise file will
        be downloaded and extracted there.
        Filename is `nsw74psid_a.csv`.

    Returns:
      Tuple of np.ndarray `x_train` with 252 rows and 10 columns and
      dictionary `metadata` of column headers (feature names).
    """
    import pandas as pd
    path = os.path.expanduser(path)
    filename = 'nsw74psid_a.csv'
    if not os.path.exists(os.path.join(path, filename)):
        url = 'http://dustintran.com/data/r/DAAG/nsw74psidA.csv'
        maybe_download_and_extract(path, url,
                                   save_file_name='nsw74psid_a.csv',
                                   resume=False)
    data = pd.read_csv(os.path.join(path, filename), index_col=0,
                       parse_dates=True)
    x_train = data.values
    metadata = {'columns': data.columns}
    return x_train, metadata
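# A minimal usage sketch (the path is an assumption; the CSV is downloaded
# on first use):
# x_train, metadata = nsw74psid_a('~/data')
# x_train.shape        # (252, 10) per the docstring
# metadata['columns']  # the ten feature names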
import os

import numpy as np
import pandas as pd

print("Imports are ready")

########################################################
### combine shipment_id, phone_id, user_id, order_id ###
########################################################
# First, load the files containing information about shipments
shipments1 = pd.read_csv("./ngwl-predict-customer-churn/shipments/shipments2020-03-01.csv")
shipments2 = pd.read_csv("./ngwl-predict-customer-churn/shipments/shipments2020-01-01.csv")
shipments3 = pd.read_csv("./ngwl-predict-customer-churn/shipments/shipments2020-04-30.csv")
shipments4 = pd.read_csv("./ngwl-predict-customer-churn/shipments/shipments2020-06-29.csv")

# Put all shipments into one table
shipments = pd.concat([shipments1, shipments2, shipments3, shipments4])

# Read addresses and fix the column names
addresses = pd.read_csv("./ngwl-predict-customer-churn/misc/addresses.csv")
addresses.columns = ["ship_address_id", "phone_id"]

# Now create the mapping through ship_address_id with the addresses to receive phone_id
shipments_and_addresses = pd.merge(addresses, shipments, on="ship_address_id")
# We will take the phone id, user id, shipment id, order id, order state from here
import os
import re
import sys

import pandas as pd

df = pd.read_csv("../files/accumulo/train_data3.csv")
buggy = df.loc[df['buggy'] == 1]
clean = df.loc[df['buggy'] == 0]
print(buggy.shape)
print(clean.shape)

'''
df = pd.read_csv("../files/accumulo/train_data_test.csv")
print(df.columns.values)
df['vector'] = df['vector'].apply(lambda v: v.replace('\n', '').split(' '))
df['vector'] = df['vector'].apply(lambda v: [float(i) for i in v])
buggy_vectors = df.loc[df['buggy'] == 1]
fixed_vectors = df.loc[df['fixed'] == 1]
import pandas as pd
import numpy as np
from sklearn.feature_selection import chi2, f_regression, mutual_info_regression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import r2_score
import tpot

train = pd.read_csv('mercedes_train.csv')
y_train = train['y'].values
train.drop(['ID', 'y'], axis=1, inplace=True)
train = pd.get_dummies(train, drop_first=True)
train = train.values

config_dict = {
    'sklearn.linear_model.ElasticNet': {
        'l1_ratio': np.arange(0.05, 1.01, 0.05),
        'alpha': np.linspace(0.001, 10.0, 100),
        'normalize': [True, False]
    },
    # 'sklearn.ensemble.ExtraTreesRegressor': {
    #     'n_estimators': range(50, 501, 50),
    #     'max_features': np.arange(0.05, 1.01, 0.05),
    #     'min_samples_split': range(2, 21),
    #     'min_samples_leaf': range(1, 21),
    #     'bootstrap': [True, False]
    # },
    # 'sklearn.ensemble.GradientBoostingRegressor': {
    #     'n_estimators': range(75, 251, 25),
    #     'loss': ["ls", "lad", "huber", "quantile"],