def get_poisson_distribution(date_range, country_code, global_min, global_max):
    """
    Args:
        date_range (pandas.core.series.Series): The date range of country data
            for the poisson distribution to be applied to.
        country_code (string): The country code of the country being explored.
        global_min (pandas.core.series.Series): A time series list of the global
            minimum tendencies for tor users.
        global_max (pandas.core.series.Series): A time series list of the global
            maximum tendencies for tor users.
    """
    current_date = date_range[0]
    comparison_date = date_range[1]
    #print(date_range)

    # If there is not a global min or a global max on the day in question then don't even try
    if pd.isnull(global_min[date_range.name]) or pd.isnull(global_max[date_range.name]):
        return pd.Series({"country": country_code, "min": None, "max": None})

    # We can't do this without both dates
    if np.isnan(comparison_date) or np.isnan(current_date):
        return pd.Series({"country": country_code, "min": None, "max": None})
    else:
        down_score = 0
        up_score = 0
        # poisson.ppf(plausible_range, shape_params)
        min_range = global_min[date_range.name] * poisson.ppf(1 - 0.9999, comparison_date)
        max_range = global_max[date_range.name] * poisson.ppf(0.9999, comparison_date)
        if current_date < min_range:
            down_score = 1
        if current_date > max_range:
            up_score = 1
        return pd.Series({"country": country_code, "min": min_range, "max": max_range,
                          "users": current_date, "event_censor": down_score,
                          "event_spike": up_score})
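A usage sketch for the function above (toy data and a call pattern of my own, not from the original project): each row of the frame holds the current and comparison user counts for one country, and the row label is the date used to look up the global min/max series.

# Assumes get_poisson_distribution from above is in scope.
import numpy as np
import pandas as pd
from scipy.stats import poisson

dates = pd.date_range("2020-01-01", periods=3)
# column 0 = current count, column 1 = comparison count (hypothetical layout)
counts = pd.DataFrame({0: [120, 80, 300], 1: [100, 100, 100]}, index=dates)
global_min = pd.Series([0.8, 0.8, 0.8], index=dates)
global_max = pd.Series([1.2, 1.2, 1.2], index=dates)

result = counts.apply(get_poisson_distribution, axis=1,
                      args=("us", global_min, global_max))
print(result)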
def remove_nans(divisions):
    """ Remove nans from divisions

    These sometimes pop up when we call min/max on an empty partition

    Examples
    --------
    >>> remove_nans((np.nan, 1, 2))
    [1, 1, 2]
    >>> remove_nans((1, np.nan, 2))
    [1, 2, 2]
    >>> remove_nans((1, 2, np.nan))
    [1, 2, 2]
    """
    divisions = list(divisions)

    for i in range(len(divisions) - 2, -1, -1):
        if pd.isnull(divisions[i]):
            divisions[i] = divisions[i + 1]

    for i in range(len(divisions) - 1, -1, -1):
        if not pd.isnull(divisions[i]):
            for j in range(i + 1, len(divisions)):
                divisions[j] = divisions[i]
            break

    return divisions
def test_gen_drawdown_table(self, px, expected_peak, expected_valley, expected_recovery, expected_duration): rets = px.pct_change().iloc[1:] drawdowns = timeseries.gen_drawdown_table(rets, top=1) self.assertTrue( pd.isnull( drawdowns.loc[ 0, 'peak date'])) if expected_peak is None \ else self.assertEqual(drawdowns.loc[0, 'peak date'], expected_peak) self.assertTrue( pd.isnull( drawdowns.loc[0, 'valley date'])) \ if expected_valley is None else self.assertEqual( drawdowns.loc[0, 'valley date'], expected_valley) self.assertTrue( pd.isnull( drawdowns.loc[0, 'recovery date'])) \ if expected_recovery is None else self.assertEqual( drawdowns.loc[0, 'recovery date'], expected_recovery) self.assertTrue( pd.isnull(drawdowns.loc[0, 'duration'])) \ if expected_duration is None else self.assertEqual( drawdowns.loc[0, 'duration'], expected_duration)
def getTrackCountryOfOrigin(billboard_df_final):
    geolocator = Nominatim()
    track_state_of_origin = []
    track_country_of_origin = []
    for index_artist, row in billboard_df_final.iterrows():
        if (not pd.isnull(row['latitude'])) & (not pd.isnull(row['longitude'])):
            try:
                location = geolocator.reverse(str(row['latitude']) + ',' + str(row['longitude']),
                                              language='en')
                state = location.raw['address']['state']
                if state == "Puerto Rico":
                    country = "Puerto Rico"
                else:
                    country = location.raw['address']['country']
                if country == "The Netherlands":
                    country = "Netherlands"
            except Exception:
                print(row["Artist(s)"])
                country = ""
                state = ""
        else:
            country = ""
            state = ""
        track_country_of_origin.append(country)
        if country == "United States of America":
            track_state_of_origin.append(state)
        else:
            track_state_of_origin.append("")
    return [track_country_of_origin, track_state_of_origin]
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_distances


def wmd(d1, d2):
    # `model` (word vectors), `emd` (Earth Mover's Distance implementation) and
    # `emd_standardize` (text cleaner) are assumed to be provided elsewhere in the project.
    if pd.isnull(d1) or pd.isnull(d2):
        return 1
    d1 = emd_standardize(d1)
    d2 = emd_standardize(d2)
    vect = CountVectorizer(stop_words="english").fit([d1, d2])
    names = vect.get_feature_names()
    v_1, v_2 = vect.transform([d1, d2])
    v_1 = v_1.toarray().ravel()
    v_2 = v_2.toarray().ravel()
    W_ = []
    for i in range(0, len(names)):
        try:
            W_.append(model[names[i]])
        except KeyError:
            W_.append(np.zeros(300))
    D_ = cosine_distances(W_)
    v_1 = v_1.astype(np.double)
    v_2 = v_2.astype(np.double)
    v_1 = v_1 + 1
    v_2 = v_2 + 1
    v_1 /= v_1.sum()
    v_2 /= v_2.sum()
    D_ = D_.astype(np.double)
    D_ = D_ + 1
    D_ /= D_.max()
    wmd = emd(v_1, v_2, D_)
    return wmd
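For context, a minimal self-contained sketch of the same emd(v_1, v_2, D_) call pattern on toy histograms; pyemd is an assumption here, since the snippet's own emd import is not shown.

import numpy as np
from pyemd import emd  # assumption: the EMD implementation used by the snippet above

# two normalized 3-bin histograms and a symmetric ground-distance matrix
v_1 = np.array([0.5, 0.3, 0.2], dtype=np.double)
v_2 = np.array([0.2, 0.3, 0.5], dtype=np.double)
D_ = np.array([[0.0, 0.5, 1.0],
               [0.5, 0.0, 0.5],
               [1.0, 0.5, 0.0]], dtype=np.double)
print(emd(v_1, v_2, D_))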
def test_replace2(self): N = 100 ser = pd.Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N), dtype=object) ser[:5] = np.nan ser[6:10] = 'foo' ser[20:30] = 'bar' # replace list with a single value rs = ser.replace([np.nan, 'foo', 'bar'], -1) assert (rs[:5] == -1).all() assert (rs[6:10] == -1).all() assert (rs[20:30] == -1).all() assert (pd.isnull(ser[:5])).all() # replace with different values rs = ser.replace({np.nan: -1, 'foo': -2, 'bar': -3}) assert (rs[:5] == -1).all() assert (rs[6:10] == -2).all() assert (rs[20:30] == -3).all() assert (pd.isnull(ser[:5])).all() # replace with different values with 2 lists rs2 = ser.replace([np.nan, 'foo', 'bar'], [-1, -2, -3]) tm.assert_series_equal(rs, rs2) # replace inplace ser.replace([np.nan, 'foo', 'bar'], -1, inplace=True) assert (ser[:5] == -1).all() assert (ser[6:10] == -1).all() assert (ser[20:30] == -1).all()
def test_conversions(data_missing): # astype to object series df = pd.DataFrame({'A': data_missing}) result = df['A'].astype('object') expected = pd.Series(np.array([np.nan, 1], dtype=object), name='A') tm.assert_series_equal(result, expected) # convert to object ndarray # we assert that we are exactly equal # including type conversions of scalars result = df['A'].astype('object').values expected = np.array([np.nan, 1], dtype=object) tm.assert_numpy_array_equal(result, expected) for r, e in zip(result, expected): if pd.isnull(r): assert pd.isnull(e) elif is_integer(r): # PY2 can be int or long assert r == e assert is_integer(e) else: assert r == e assert type(r) == type(e)
def assert_timestamp_and_datetime_equal(result,
                                        expected,
                                        path=(),
                                        msg='',
                                        allow_datetime_coercions=False,
                                        compare_nat_equal=True,
                                        **kwargs):
    """
    Branch for comparing python datetime (which includes pandas Timestamp) and
    np.datetime64 as equal.

    Raises unless ``allow_datetime_coercions`` is passed as True.
    """
    assert allow_datetime_coercions or type(result) == type(expected), (
        "%sdatetime types (%s, %s) don't match and "
        "allow_datetime_coercions was not set.\n%s" % (
            _fmt_msg(msg),
            type(result),
            type(expected),
            _fmt_path(path),
        )
    )

    result = pd.Timestamp(result)
    expected = pd.Timestamp(expected)
    if compare_nat_equal and pd.isnull(result) and pd.isnull(expected):
        return

    assert_equal.dispatch(object, object)(
        result,
        expected,
        path=path,
        **kwargs
    )
def test_constructor(self): # explicit construction index = Float64Index([1, 2, 3, 4, 5]) self.assertIsInstance(index, Float64Index) expected = np.array([1, 2, 3, 4, 5], dtype='float64') self.assert_numpy_array_equal(index.values, expected) index = Float64Index(np.array([1, 2, 3, 4, 5])) self.assertIsInstance(index, Float64Index) index = Float64Index([1., 2, 3, 4, 5]) self.assertIsInstance(index, Float64Index) index = Float64Index(np.array([1., 2, 3, 4, 5])) self.assertIsInstance(index, Float64Index) self.assertEqual(index.dtype, float) index = Float64Index(np.array([1., 2, 3, 4, 5]), dtype=np.float32) self.assertIsInstance(index, Float64Index) self.assertEqual(index.dtype, np.float64) index = Float64Index(np.array([1, 2, 3, 4, 5]), dtype=np.float32) self.assertIsInstance(index, Float64Index) self.assertEqual(index.dtype, np.float64) # nan handling result = Float64Index([np.nan, np.nan]) self.assertTrue(pd.isnull(result.values).all()) result = Float64Index(np.array([np.nan])) self.assertTrue(pd.isnull(result.values).all()) result = Index(np.array([np.nan])) self.assertTrue(pd.isnull(result.values).all())
def load_data(): # Read file content training_file_content = pd.read_csv(TRAINING_FILE_PATH) testing_file_content = pd.read_csv(TESTING_FILE_PATH) combined_file_content = pd.concat([training_file_content, testing_file_content]) # Manipulate file content X = combined_file_content.drop([ID_COLUMN_NAME, LABEL_COLUMN_NAME], axis=1).as_matrix() categorical_features_mask_list = [] for column_vector in X.T: valid_elements_mask = np.logical_not(pd.isnull(column_vector)) if np.can_cast(type(column_vector[valid_elements_mask][0]), np.float): categorical_features_mask_list.append(False) min_value = np.min(column_vector[valid_elements_mask]) column_vector[np.logical_not(valid_elements_mask)] = min_value - 1 else: categorical_features_mask_list.append(True) column_vector[np.logical_not(valid_elements_mask)] = "Missing" column_vector[:] = perform_categorization(column_vector) encoder = OneHotEncoder(categorical_features=categorical_features_mask_list) X = encoder.fit_transform(X).toarray() # Separate the data set Y = combined_file_content[LABEL_COLUMN_NAME].as_matrix() ID = combined_file_content[ID_COLUMN_NAME].as_matrix() test_data_mask = pd.isnull(Y) X_train = X[np.logical_not(test_data_mask)] Y_train = Y[np.logical_not(test_data_mask)] X_test = X[test_data_mask] ID_test = ID[test_data_mask] return X_train, Y_train, X_test, ID_test
def _check_fill(meth, op, a, b, fill_value=0):
    exp_index = a.index.union(b.index)
    a = a.reindex(exp_index)
    b = b.reindex(exp_index)
    amask = isnull(a)
    bmask = isnull(b)

    exp_values = []
    for i in range(len(exp_index)):
        if amask[i]:
            if bmask[i]:
                exp_values.append(nan)
                continue
            exp_values.append(op(fill_value, b[i]))
        elif bmask[i]:
            if amask[i]:
                exp_values.append(nan)
                continue
            exp_values.append(op(a[i], fill_value))
        else:
            exp_values.append(op(a[i], b[i]))

    result = meth(a, b, fill_value=fill_value)
    expected = Series(exp_values, exp_index)
    assert_series_equal(result, expected)
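For reference, a small standalone example (not part of the original test) of the fill_value semantics this helper verifies:

import numpy as np
import pandas as pd

a = pd.Series([1.0, np.nan, 3.0], index=["x", "y", "z"])
b = pd.Series([10.0, 20.0], index=["y", "z"])

# Missing values on one side are replaced by fill_value before the operation;
# a location missing on both sides would stay NaN.
print(a.add(b, fill_value=0))   # x -> 1.0, y -> 10.0, z -> 23.0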
def _write_data_dates(self): convert_dates = self._convert_dates data = self.datarows byteorder = self._byteorder TYPE_MAP = self.TYPE_MAP MISSING_VALUES = self.MISSING_VALUES typlist = self.typlist for row in data: #row = row.squeeze().tolist() # needed for structured arrays for i, var in enumerate(row): typ = ord(typlist[i]) #NOTE: If anyone finds this terribly slow, there is # a vectorized way to convert dates, see genfromdta for going # from int to datetime and reverse it. will copy data though if i in convert_dates: var = _datetime_to_stata_elapsed(var, self.fmtlist[i]) if typ <= 244: # we've got a string if isnull(var): var = "" # missing string if len(var) < typ: var = _pad_bytes(var, len(var) + 1) self._write(var) else: if isnull(var): # this only matters for floats var = MISSING_VALUES[typ] self._write(struct.pack(byteorder+TYPE_MAP[typ], var))
def compute(self, df, chunk_rows=None):
    assert self.columns
    for column in self.columns:
        if column not in df.columns:
            df[column] = numpy.nan

    rows_to_annotate = pandas.isnull(df[self.columns[0]])
    for column in self.columns[1:]:
        rows_to_annotate = rows_to_annotate | pandas.isnull(df[column])

    while rows_to_annotate.sum() > 0:
        if chunk_rows:
            this_chunk_rows = rows_to_annotate & (rows_to_annotate.cumsum() <= chunk_rows)
        else:
            this_chunk_rows = rows_to_annotate

        num_remaining = rows_to_annotate.sum()
        logging.info("%s: %d / %d (%0.1f%%) remaining. Processing %d rows." % (
            self.name,
            num_remaining,
            len(rows_to_annotate),
            num_remaining * 100.0 / len(rows_to_annotate),
            this_chunk_rows.sum()))

        rows_to_annotate = rows_to_annotate & (~this_chunk_rows)

        if this_chunk_rows.sum() > 0:
            start = time.time()
            # .loc replaces the deprecated .ix indexer
            df.loc[this_chunk_rows, self.columns] = self.process_chunk(
                df.loc[this_chunk_rows].copy())[self.columns]
            logging.info("Processed in %0.2f sec" % (time.time() - start))

        yield this_chunk_rows.sum()
def test_multiple_children_both_missing(self, entityset, extra_session_df, wishlist_df, true_sessions_lti): # test all instances in neither child sessions = entityset['sessions'] # add row to sessions to create session with no events sessions.update_data(extra_session_df) entityset.entity_from_dataframe(entity_id="wishlist_log", dataframe=wishlist_df, index='id', make_index=True, time_index='datetime') relationship = Relationship(entityset['sessions']['id'], entityset['wishlist_log']['session_id']) entityset.add_relationship(relationship) entityset.add_last_time_indexes() sessions = entityset['sessions'] # wishlist has 2 newer events and one is NaT true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30") true_sessions_lti[3] = pd.Timestamp("2011-4-10 10:41:00") true_sessions_lti[6] = pd.NaT assert len(sessions.last_time_index) == 7 sorted_lti = sessions.last_time_index.sort_index() for v1, v2 in zip(sorted_lti, true_sessions_lti): assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2
def test_leaf_no_time_index(self, entityset): entityset.add_last_time_indexes() stores = entityset['stores'] true_lti = pd.Series([None for x in range(6)], dtype='datetime64[ns]') assert len(true_lti) == len(stores.last_time_index) for v1, v2 in zip(stores.last_time_index, true_lti): assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2
def test_multiple_children_left_missing(self, entityset, extra_session_df, wishlist_df, true_sessions_lti): # test all instances in right child sessions = entityset['sessions'] # add row to sessions so not all session instances are in log sessions.update_data(extra_session_df) # add row to wishlist df so new session instance in in wishlist_log row_values = {'session_id': 6, 'datetime': pd.Timestamp("2011-04-11 11:11:11"), 'product_id': 'toothpaste'} row = pd.DataFrame(row_values, index=pd.RangeIndex(start=7, stop=8)) df = wishlist_df.append(row) entityset.entity_from_dataframe(entity_id="wishlist_log", dataframe=df, index='id', make_index=True, time_index='datetime') relationship = Relationship(entityset['sessions']['id'], entityset['wishlist_log']['session_id']) entityset.add_relationship(relationship) entityset.add_last_time_indexes() # now wishlist_log has newer events for 3 session ids true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30") true_sessions_lti[3] = pd.Timestamp("2011-4-10 10:41:00") true_sessions_lti[6] = pd.Timestamp("2011-04-11 11:11:11") assert len(sessions.last_time_index) == 7 sorted_lti = sessions.last_time_index.sort_index() for v1, v2 in zip(sorted_lti, true_sessions_lti): assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2
def test_multiple_children_all_combined(self, entityset, extra_session_df, wishlist_df, true_sessions_lti): # test some instances in right, some in left, all when combined sessions = entityset['sessions'] # add row to sessions so not all session instances are in log sessions.update_data(extra_session_df) # add row to wishlist_log so extra session has child instance row_values = {'session_id': 6, 'datetime': pd.Timestamp("2011-04-11 11:11:11"), 'product_id': 'toothpaste'} row = pd.DataFrame(row_values, index=pd.RangeIndex(start=7, stop=8)) df = wishlist_df.append(row) # drop instance 4 so wishlist_log does not have session id 3 instance df.drop(4, inplace=True) entityset.entity_from_dataframe(entity_id="wishlist_log", dataframe=df, index='id', make_index=True, time_index='datetime') relationship = Relationship(entityset['sessions']['id'], entityset['wishlist_log']['session_id']) entityset.add_relationship(relationship) entityset.add_last_time_indexes() # wishlist has newer events for 2 sessions true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30") true_sessions_lti[6] = pd.Timestamp("2011-04-11 11:11:11") assert len(sessions.last_time_index) == 7 sorted_lti = sessions.last_time_index.sort_index() for v1, v2 in zip(sorted_lti, true_sessions_lti): assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2
def filter_pair(self, lstring, rstring):
    """Checks if the input strings get dropped by the overlap filter.

    Args:
        lstring,rstring (string): input strings

    Returns:
        A flag indicating whether the string pair is dropped (boolean).
    """
    # If one of the inputs is missing, then check the allow_missing flag.
    # If it is set to True, then pass the pair. Else drop the pair.
    if pd.isnull(lstring) or pd.isnull(rstring):
        return (not self.allow_missing)

    # check for empty string
    if (not lstring) or (not rstring):
        return True

    # tokenize input strings
    ltokens = self.tokenizer.tokenize(lstring)
    rtokens = self.tokenizer.tokenize(rstring)

    num_overlap = overlap(ltokens, rtokens)

    if COMP_OP_MAP[self.comp_op](num_overlap, self.overlap_size):
        return False
    else:
        return True
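To make the rule concrete, here is a minimal standalone sketch of the same overlap test; whitespace tokenization and a ">=" comparison are assumptions, since the class above delegates those choices to its tokenizer and COMP_OP_MAP.

def overlap_drops_pair(lstring, rstring, overlap_size=1):
    # count tokens shared by the two strings
    ltokens = set(lstring.split())
    rtokens = set(rstring.split())
    num_overlap = len(ltokens & rtokens)
    # the pair is kept (not dropped) when the overlap meets the threshold
    return not (num_overlap >= overlap_size)

print(overlap_drops_pair("data science handbook", "hands-on data science"))  # False: kept
print(overlap_drops_pair("apple pie", "stock market"))                       # True: dropped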
def zeitsci_grant_normalize_wrapper(x): from_year = np.NaN if pd.isnull(x['GrantYear']): if pd.isnull(x['StartDate']): return np.NaN else: if len([i for i in x['StartDate'].split("/") if len(i) == 4]) != 1: return np.NaN else: from_year = [i for i in x['StartDate'].split("/") if len(i) == 4][0] else: from_year = x['GrantYear'] input_dict = { 'amount': x['Amount'], 'block': x['OrganizationBlock'], 'amount_cur': x['FundCurrency'], 'from_year': int(from_year) } if any(pd.isnull(i) for i in input_dict.values()): return np.NaN return zeitsci_normalize(input_dict['amount'] , input_dict['block'] , input_dict['amount_cur'] , input_dict['from_year'])
def __load_dataset__(log, enroll_ids, base_date): """get all instances in this time window""" X = IO.load_enrollments().set_index('enrollment_id')\ .ix[enroll_ids].reset_index() for f in features.METHODS: X_ = f.extract(base_date) if X_ is None: print('%s returns None' % repr(f.__name__)) continue if np.any(pd.isnull(X_)): raise RuntimeError('%s can generate NA(s)' % repr(f.__name__)) X = pd.merge(X, X_, how='left', on='enrollment_id') if np.any(pd.isnull(X)): raise RuntimeError('%s does not generate features of all ' 'enrollments' % repr(f.__name__)) active_eids = set(log[(log['time'] > base_date) & (log['time'] <= base_date + timedelta(days=10))] ['enrollment_id']) y = [int(eid not in active_eids) for eid in enroll_ids] del X['enrollment_id'] del X['username'] del X['course_id'] return X.as_matrix(), np.array(y, dtype=np.int)
def omega_ratio(dataframe, MAR):
    '''Calculate the Omega ratio of each index in the dataframe.

    Args:
        dataframe: the dataframe passed by the concat_data() function; each column is an
            index whose monthly returns are used in the calculation
        MAR: the minimum acceptable return, used for calculating the excess return

    Returns:
        A dataframe of Omega ratios for each index across different year lengths
        (12, 36, 60, 84, 120 and 180 months) plus a "Since Inception" column.
    '''
    year_list = [12, 36, 60, 84, 120, 180]
    Omega_df = pd.DataFrame(index=dataframe.columns)

    # Force all nan in dataframe to be np.nan
    dataframe = dataframe.fillna(np.nan)

    # Calculation
    for i in year_list:
        for j in dataframe.columns:
            # Since np.nan + np.array cannot exclude the NaN scenario (due to the > MAR
            # condition), we need to manually check for NaN
            if np.prod(~pd.isnull(dataframe[j].iloc[-i:])) == 0:
                Omega_df.loc[j, '%d_Months' % i] = np.nan
            elif np.prod(~pd.isnull(dataframe[j].iloc[-i:])) != 0:
                Omega_df.loc[j, '%d_Months' % i] = \
                    np.sum(dataframe[j].iloc[-i:][dataframe[j].iloc[-i:] > MAR] - MAR**(1/12)) \
                    / -np.sum(dataframe[j].iloc[-i:][dataframe[j].iloc[-i:] < MAR] - MAR**(1/12))

    for j in dataframe.columns:
        Inception = int(np.count_nonzero(~np.isnan(dataframe[j])))
        Omega_df.loc[j, 'Since Inception'] = \
            np.sum(dataframe[j].iloc[-Inception:][dataframe[j].iloc[-Inception:] > MAR] - MAR**(1/12), axis=0) \
            / -np.sum(dataframe[j].iloc[-Inception:][dataframe[j].iloc[-Inception:] < MAR] - MAR**(1/12), axis=0)

    return Omega_df
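As a quick standalone illustration of the ratio itself (toy numbers, not using the function above): Omega is the sum of returns above the minimum acceptable return divided by the magnitude of the shortfalls below it.

import numpy as np

returns = np.array([0.02, -0.01, 0.03, -0.02, 0.01])
MAR = 0.0
gains = np.sum(returns[returns > MAR] - MAR)     # 0.06
losses = -np.sum(returns[returns < MAR] - MAR)   # 0.03
print(gains / losses)                            # 2.0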
def test_minmax(self): # monotonic idx1 = TimedeltaIndex(['1 days', '2 days', '3 days']) self.assertTrue(idx1.is_monotonic) # non-monotonic idx2 = TimedeltaIndex(['1 days', np.nan, '3 days', 'NaT']) self.assertFalse(idx2.is_monotonic) for idx in [idx1, idx2]: self.assertEqual(idx.min(), Timedelta('1 days')), self.assertEqual(idx.max(), Timedelta('3 days')), self.assertEqual(idx.argmin(), 0) self.assertEqual(idx.argmax(), 2) for op in ['min', 'max']: # Return NaT obj = TimedeltaIndex([]) self.assertTrue(pd.isnull(getattr(obj, op)())) obj = TimedeltaIndex([pd.NaT]) self.assertTrue(pd.isnull(getattr(obj, op)())) obj = TimedeltaIndex([pd.NaT, pd.NaT, pd.NaT]) self.assertTrue(pd.isnull(getattr(obj, op)()))
def cosine(arr1, arr2):
    if arr1 is None or arr2 is None:
        return np.NaN
    if pd.isnull(arr1) or pd.isnull(arr2):
        return np.NaN
    sim = jpype.JClass('build.SimilarityFunction')()
    return sim.cosine(arr1, arr2)
def normalized_price(price_df):
    """
    Return the normalized price of a series

    :ARGS:

        price_df: :class:`pandas.Series` or :class:`pandas.DataFrame`

    :RETURNS:

        same as the input
    """
    if isinstance(price_df, pandas.Series):
        if pandas.isnull(price_df).any():
            print("This series contains null values")
            return
        else:
            return price_df.div(price_df[0])
    elif isinstance(price_df, pandas.DataFrame):
        if pandas.isnull(price_df).any().any():
            print("This DataFrame contains null values")
            return
        else:
            return price_df.div(price_df.iloc[0, :])
    else:
        print("Input must be pandas.Series or pandas.DataFrame")
        return
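A small usage sketch (toy prices; assumes the function above is in scope together with the pandas import it relies on):

import pandas

prices = pandas.Series([100.0, 102.0, 99.0, 105.0])
print(normalized_price(prices))    # each value divided by the first: 1.00, 1.02, 0.99, 1.05
# equivalent one-liner for a series with no missing values
print(prices.div(prices.iloc[0]))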
def get_address_line(self, index, address1, city, state, zip_code):
    # required: print message for exception
    if not pd.isnull(address1):
        #address = street
        address1 = str(re.sub(r'[^\x00-\x7f]', r' ', address1.strip()))
        #address1 = ' '.join(address1.split())
        print(1, address1)
        #print(type(address))
    else:
        raise ValueError('Missing street value at row ' + str(index) + '.')
        #address = ''
    if not pd.isnull(city):
        city_name = str(city)
    else:
        raise ValueError('Missing city value at row ' + str(index) + '.')
        #city_name = ''
    if not pd.isnull(zip_code):
        zip = str(zip_code)
        #print(zip)
        #print(type(zip))
    else:
        raise ValueError('Missing zip code value at row ' + str(index) + '.')
    #print(address)
    #print(type(address))
    final_line = address1 + ", " + city_name + ', ' + config.state_abbreviation_upper + ' ' + zip
    final_line = ' '.join(final_line.split())
    #print(index, final_line)
    return final_line
def _get_daily_spot_value(self, asset, column, dt): reader = self._get_pricing_reader('daily') if column == "last_traded": last_traded_dt = reader.get_last_traded_dt(asset, dt) if isnull(last_traded_dt): return pd.NaT else: return last_traded_dt elif column in OHLCV_FIELDS: # don't forward fill try: return reader.get_value(asset, dt, column) except NoDataOnDate: return np.nan elif column == "price": found_dt = dt while True: try: value = reader.get_value( asset, found_dt, "close" ) if not isnull(value): if dt == found_dt: return value else: # adjust if needed return self.get_adjusted_value( asset, column, found_dt, dt, "minute", spot_value=value ) else: found_dt -= self.trading_calendar.day except NoDataOnDate: return np.nan
def oncall(self, controller, currentevlog, currentresplog):
    # important to super here to reset starttime
    super(NBackEvent, self).oncall(controller, currentevlog, currentresplog)
    if self.nshift == 0:
        currentname = self.name
    else:
        try:
            currentname = currentevlog.iloc[self.nshift]['name']
        except IndexError:
            currentname = numpy.nan
        except:
            raise
    try:
        previousname = currentevlog.iloc[self.nshift - self.nback]['name']
    except IndexError:
        previousname = numpy.nan
    except:
        raise
    self.wasrep = 0.
    if pandas.isnull(currentname) or pandas.isnull(previousname):
        self.wasrep = numpy.nan
    elif currentname == previousname:
        self.wasrep = 1.
    if self.verbose:
        print('current=%s\t last=%s\twasrep=%s' %
              (currentname, previousname, self.wasrep))
    # so now we just reassign the keys
    if self.wasrep:
        self.correct = self.repkey[:]
        self.incorrect = self.notrepkey[:]
    else:
        self.correct = self.notrepkey[:]
        self.incorrect = self.repkey[:]
    return
def test_minmax(self): for tz in self.tz: # monotonic idx1 = pd.DatetimeIndex([pd.NaT, '2011-01-01', '2011-01-02', '2011-01-03'], tz=tz) self.assertTrue(idx1.is_monotonic) # non-monotonic idx2 = pd.DatetimeIndex(['2011-01-01', pd.NaT, '2011-01-03', '2011-01-02', pd.NaT], tz=tz) self.assertFalse(idx2.is_monotonic) for idx in [idx1, idx2]: self.assertEqual(idx.min(), pd.Timestamp('2011-01-01', tz=tz)) self.assertEqual(idx.max(), pd.Timestamp('2011-01-03', tz=tz)) for op in ['min', 'max']: # Return NaT obj = DatetimeIndex([]) self.assertTrue(pd.isnull(getattr(obj, op)())) obj = DatetimeIndex([pd.NaT]) self.assertTrue(pd.isnull(getattr(obj, op)())) obj = DatetimeIndex([pd.NaT, pd.NaT, pd.NaT]) self.assertTrue(pd.isnull(getattr(obj, op)()))
def read_rdata(rdata_fullpath, table_name):
    """
    Returns the pandas DataFrame
    """
    from rpy2.robjects import pandas2ri, r
    pandas2ri.activate()

    # we want forward slashes for R
    rdata_fullpath_forR = rdata_fullpath.replace("\\", "/")
    print("Loading %s" % rdata_fullpath_forR)

    # read in the data from the R session with python
    r['load'](rdata_fullpath_forR)
    # check that it's there
    table_df = pandas2ri.ri2py(r['model_summary'])

    # fillna
    for col in table_df.columns:
        nullcount = sum(pandas.isnull(table_df[col]))
        if nullcount > 0:
            print("  Found %5d NA values in column %s" % (nullcount, col))
    table_df = table_df.fillna(0)
    for col in table_df.columns:
        nullcount = sum(pandas.isnull(table_df[col]))
        if nullcount > 0:
            print("  -> Found %5d NA values in column %s" % (nullcount, col))

    print("Read %d lines from %s" % (len(table_df), rdata_fullpath))
    return table_df
def metacsv_dataframe_to_dataarray(dataframe, names=None, attrs=None):
    global xr
    if xr is None:
        _import_xarray()

    dataframe = dataframe.copy()

    if attrs is None:
        attrs = dataframe.attrs

    coords = dataframe.coords.copy()

    dataframe.index.names = [
        str(ind) if not pd.isnull(ind) else 'ind_{}'.format(i)
        for i, ind in enumerate(dataframe.index.names)]

    if dataframe.coords == None:
        coords.update({c: None for c in dataframe.index.names})

    dataframe.columns.names = [
        str(c) if not pd.isnull(c) else 'coldim_{}'.format(i)
        for i, c in enumerate(dataframe.columns.names)]

    colnames = dataframe.columns.names
    series = dataframe._constructor_sliced(dataframe.stack(colnames))

    coords.update({c: None for c in colnames})
    series.coords.update(coords)

    return metacsv_series_to_dataarray(series, attrs=attrs)
dev_prev_365 = stock_data['Close'].rolling(window=365, min_periods=2).std() stock_data['Ratio_past5_365'] = stock_data['Avg_day_5'] / dev_prev_365 # Shifting the indexes of the dataframe by 1 period stock_data.shift() print(stock_data.head(10)) print(stock_data.tail(10)) # Removing rows from the dataset with dates before 1951-10-03 stock_data = stock_data[ stock_data['Date'] > datetime(year=1951, month=1, day=2)] print(stock_data.head(5)) # Removing NAN values from the dataset stock_data = stock_data.dropna(axis=0) print(pd.isnull(stock_data).sum()) # Splitting the dataset into train and test data train = stock_data[stock_data['Date'] < datetime(year=2013, month=1, day=1)] test = stock_data[stock_data['Date'] > datetime(year=2013, month=1, day=1)] print(train.tail(3)) print(test.head(3)) # Training a linear regression model to predict the Close stock prices from sklearn.linear_model import LinearRegression from sklearn.metrics import mean_absolute_error linear_model = LinearRegression() model_fit = linear_model.fit( train[['Avg_day_5', 'Sdev_day_5', 'Ratio_past5_365']], train['Close']) predict_train = model_fit.predict(
def main(xlsx_path, out_path, action, viral_submission=False): # PARSE STUDIES ################# xl_sheet = read_ena_xlsx_sheet(xlsx_path, sheet_name="ENA_study") if xl_sheet.shape[0] < 1: raise ValueError("No entries found in studies sheet") studies_col = ["alias", "title", "study_type", "study_abstract"] try: studies_dict = extract_data(xl_sheet, studies_col) except AssertionError as e: print("Sheet ENA_study: ", e) raise # PARSE SAMPLES ################# xl_sheet = read_ena_xlsx_sheet(xlsx_path, sheet_name="ENA_sample") if xl_sheet.shape[0] < 1: raise ValueError("No entries found in samples") if viral_submission: samples_cols = [ "alias", "title", "scientific_name", "sample_description", "geographic location (country and/or sea)", "host common name", "host health state", "host sex", "host scientific name", "collector name", "collection date", "collecting institution", "isolate", ] else: samples_cols = ["alias", "title", "scientific_name", "sample_description"] try: samples_dict = extract_data(xl_sheet, samples_cols) except AssertionError as e: print("Sheet ENA_sample: ", e) raise # PARSE EXPERIMENTS ################# xl_sheet = read_ena_xlsx_sheet(xlsx_path, sheet_name="ENA_experiment") if xl_sheet.shape[0] < 1: raise ValueError("No experiments found in experiments sheet") exp_columns = [ "alias", "title", "study_alias", "sample_alias", "design_description", "library_name", "library_strategy", "library_source", "library_selection", "library_layout", "insert_size", "library_construction_protocol", "platform", "instrument_model", ] try: experiments_dict = extract_data(xl_sheet, exp_columns) except AssertionError as e: print("Sheet ENA_experiment: ", e) raise # PARSE RUNS SHEET ################# xl_sheet = read_ena_xlsx_sheet(xlsx_path, sheet_name="ENA_run") if xl_sheet.shape[0] < 1: raise ValueError("No entries found in runs sheet") run_cols = ["alias", "experiment_alias", "file_name", "file_format"] try: runs_dict = extract_data(xl_sheet, run_cols, unique_key="file_name") except AssertionError as e: print("Sheet ENA_run: ", e) raise # DROP COMMENTS ############### studies_dict = { k: v for k, v in studies_dict.items() if k in set([v["study_alias"] for k, v in experiments_dict.items()]) } assert bool(studies_dict), "No entries found in studies" experiments_dict = { k: v for k, v in experiments_dict.items() if v["study_alias"] in studies_dict.keys() } assert bool(experiments_dict), "No entries found in experiments" samples_dict = { k: v for k, v in samples_dict.items() if k in set([v["sample_alias"] for k, v in experiments_dict.items()]) } assert bool(samples_dict), "No entries found in samples" runs_dict = { k: v for k, v in runs_dict.items() if v["experiment_alias"] in experiments_dict.keys() } assert bool(runs_dict), "No entries found in runs" # WRITE HEADERS TO TABLES studies_table = open(pathlib.Path(out_path) / "studies.tsv", "w") studies_table.write( "\t".join( [ "alias", "status", "accession", "title", "study_type", "study_abstract", "pubmed_id", "submission_date", ] ) + "\n" ) samples_table = open(pathlib.Path(out_path) / "samples.tsv", "w") if viral_submission: samples_table.write( "\t".join( [ "alias", "status", "accession", "title", "scientific_name", "taxon_id", "sample_description", "collection_date", "geographic_location", "host_common_name", "host_subject_id", "host_health_state", "host_sex", "host_scientific_name", "collector_name", "collecting_institution", "isolate", "submission_date", ] ) + "\n" ) else: samples_table.write( "\t".join( [ "alias", "status", "accession", 
"title", "scientific_name", "taxon_id", "sample_description", "submission_date", ] ) + "\n" ) experiments_table = open(pathlib.Path(out_path) / "experiments.tsv", "w") experiments_table.write( "\t".join( [ "alias", "status", "accession", "title", "study_alias", "sample_alias", "design_description", "library_name", "library_strategy", "library_source", "library_selection", "library_layout", "insert_size", "library_construction_protocol", "platform", "instrument_model", "submission_date", ] ) + "\n" ) runs_table = open(pathlib.Path(out_path) / "runs.tsv", "w") runs_table.write( "\t".join( [ "alias", "status", "accession", "experiment_alias", "file_name", "file_format", "file_checksum", "submission_date", ] ) + "\n" ) action = action # WRITE DICTIONARIES TO TABLE FILES # ADD A TIMESTAMP TO THE ALIAS? SEEMS LIKE ENA REQUIRES ALL ENTRIES FOR A WEBIN TO HAVE UNIQUE IDS? # dt_oobj = datetime.now(tz=None) # timestamp = dt_oobj.strftime("%Y%m%d_%H:%M:%S") for study_alias, study in studies_dict.items(): # study_alias = study_alias + '_' + timestamp studies_table.write( "\t".join( [ study_alias, action, "ENA_accession", study["title"], study["study_type"], study["study_abstract"], "", "ENA_submission_data", ] ) + "\n" ) # assuming no pubmed_id for sample_alias, sample in samples_dict.items(): # sample_alias = sample_alias + '_' + timestamp if viral_submission: if sample["collector name"] == "": sample["collector name"] = "unknown" samples_table.write( "\t".join( [ sample_alias, action, "ena_accession", sample["title"], sample["scientific_name"], "tax_id_updated_by_ENA", sample["sample_description"], sample["collection date"], sample["geographic location (country and/or sea)"], sample["host common name"], "host subject id", sample["host health state"], sample["host sex"], sample["host scientific name"], sample["collector name"], sample["collecting institution"], sample["isolate"], "ENA_submission_date", ] ) + "\n" ) else: samples_table.write( "\t".join( [ sample_alias, action, "ena_accession", sample["title"], sample["scientific_name"], "tax_id_updated_by_ENA", sample["sample_description"], ] ) + "\n" ) for exp_alias, exp in experiments_dict.items(): # should I check here if any experiment has a study or sample alias that is incorrect? # (not listed in the samples or study dict) # process the experiments for this sample if exp["sample_alias"] == sample_alias: if pd.isnull(exp["library_name"]): if exp["sample_alias"] in exp_alias: lib_alias = exp_alias else: lib_alias = exp_alias + "_" + exp["sample_alias"] else: lib_alias = exp["library_name"] experiments_table.write( "\t".join( [ exp_alias, action, "ena_accession", exp["title"], exp["study_alias"], sample_alias, exp["design_description"], lib_alias, exp["library_strategy"], exp["library_source"], exp["library_selection"], exp["library_layout"].lower(), str(int(exp["insert_size"])), exp["library_construction_protocol"], exp["platform"], exp["instrument_model"], "submission_date_ENA", ] ) + "\n" ) for file_name, run in runs_dict.items(): if run["experiment_alias"] == exp_alias: runs_table.write( "\t".join( [ run["alias"], action, "ena_run_accession", exp_alias, file_name, FILE_FORMAT, "file_checksum", "submission_date_ENA", ] ) + "\n" ) studies_table.close() samples_table.close() experiments_table.close() runs_table.close()
#-*- coding:utf-8 _*-
"""
@author: Administrator
@file: test_something_doubt.py
@time: 2018/8/16
"""
import pandas as pd

data = pd.read_csv('./month_6_1.csv', header=0)
data_null_len = len(data[pd.isnull(data['bedrooms'])])
print(data_null_len)

# Another way to find the missing values: use the condition data[col] == data[col].
# Missing values are not equal to themselves, so only the non-missing rows satisfy the equality.
data_new = data.loc[data['bedrooms'] == data['bedrooms']]
print(data_new.shape[0])
print(data.shape[0])
def _set_pctChg(pctChg):
    if pd.isnull(pctChg):
        return None
    else:
        return pctChg
# method 1 uses the pandas isin function
not_in_DF1_method1 = DF2[~DF2['AA'].isin(DF1['AA'])]
print(not_in_DF1_method1)
not_in_DF2_method1 = DF1[~DF1['AA'].isin(DF2['AA'])]
print(not_in_DF2_method1)

# method 2 is more generic but produces the same results
DF1list = [True] * DF1.shape[0]
DF2list = [True] * DF2.shape[0]
DF1.loc[:, 'inDF1'] = DF1list
DF2.loc[:, 'inDF2'] = DF2list
bigDF = pd.merge(DF1, DF2, how="outer")

not_in_DF1_method2 = bigDF.drop('inDF2', axis=1)
not_in_DF1_method2 = not_in_DF1_method2[pd.isnull(not_in_DF1_method2).any(axis=1)]
not_in_DF1_method2 = not_in_DF1_method2.drop('inDF1', axis=1)
print(not_in_DF1_method2)

not_in_DF2_method2 = bigDF.drop('inDF1', axis=1)
not_in_DF2_method2 = not_in_DF2_method2[pd.isnull(not_in_DF2_method2).any(axis=1)]
not_in_DF2_method2 = not_in_DF2_method2.drop('inDF2', axis=1)
print(not_in_DF2_method2)
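The snippet above assumes DF1 and DF2 already exist; a hypothetical setup that makes the comparison runnable end to end could look like this (toy data):

import pandas as pd

DF1 = pd.DataFrame({'AA': [1, 2, 3], 'BB': ['a', 'b', 'c']})
DF2 = pd.DataFrame({'AA': [2, 3, 4], 'BB': ['b', 'c', 'd']})

# rows of DF2 whose 'AA' value never appears in DF1 (here the row with AA == 4)
print(DF2[~DF2['AA'].isin(DF1['AA'])])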
print('movies:\n', movies.head())
print('movies columns:\n', movies.columns)

# 1. Merge the datasets
credits.rename(columns={'movie_id': 'id'}, inplace=True)
print('credit columns:\n', credits.columns)
# merge the two tables on the movie id and title
data = pd.merge(left=credits, right=movies, on=['id', 'title'], how='outer')
# inspect the merged movie data
print('data columns:\n', data.columns)
print('data shape:\n', data.shape)

# check for missing values
res_nul = pd.isnull(data).sum()
print('res_nul:\n', res_nul)

# get the title of the movie with a missing release date
bool_mask_name_nul = pd.isnull(data.loc[:, 'release_date'])
# show that movie's title
movies_names = data.loc[bool_mask_name_nul, 'original_title'].values[0]
print('movies_names:\n', movies_names)  # America Is Still the Place
# fill in the release date
data.loc[bool_mask_name_nul, 'release_date'] = '2014-06-01'

# get the movies with a missing runtime
bool_mask_runtime = pd.isnull(data.loc[:, 'runtime'])
movie_unruntime = data.loc[bool_mask_runtime, 'original_title'].values
print('movies unnames:\n', movie_unruntime)
def _apply_predicate(op, val, col_stats): # Sanitize operator if op not in {"=", "==", "!=", "<", "<=", ">", ">=", "in", "not in"}: raise ValueError(f"'{op}' is not a valid operator in predicates.") col_min = col_stats.get("minimum", None) col_max = col_stats.get("maximum", None) col_sum = col_stats.get("sum", None) # Apply operator if op == "=" or op == "==": if _apply_filter_not_eq(val, col_stats): return False # TODO: Replace pd.isnull with # cudf.isnull once it is implemented if pd.isnull(val) and not col_stats["has_null"]: return False if not _apply_filter_bool_eq(val, col_stats): return False elif op == "!=": if ( col_min is not None and col_max is not None and val == col_min and val == col_max ): return False if _apply_filter_bool_eq(val, col_stats): return False elif col_min is not None and ( (op == "<" and val <= col_min) or (op == "<=" and val < col_min) ): return False elif col_max is not None and ( (op == ">" and val >= col_max) or (op == ">=" and val > col_max) ): return False elif ( col_sum is not None and op == ">" and ( (col_min is not None and col_min >= 0 and col_sum <= val) or (col_max is not None and col_max <= 0 and col_sum >= val) ) ): return False elif ( col_sum is not None and op == ">=" and ( (col_min is not None and col_min >= 0 and col_sum < val) or (col_max is not None and col_max <= 0 and col_sum > val) ) ): return False elif op == "in": if (col_max is not None and col_max < min(val)) or ( col_min is not None and col_min > max(val) ): return False if all(_apply_filter_not_eq(elem, col_stats) for elem in val): return False elif op == "not in" and col_min is not None and col_max is not None: if any(elem == col_min == col_max for elem in val): return False col_range = None if isinstance(col_min, int): col_range = range(col_min, col_max) elif isinstance(col_min, datetime.datetime): col_range = pd.date_range(col_min, col_max) if col_range and all(elem in val for elem in col_range): return False return True
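To illustrate the idea behind this kind of statistics-based pruning, here is a standalone sketch of my own (not the cudf implementation) for the "<" case: a stripe can be skipped whenever its minimum already rules every row out.

def stripe_may_contain_lt(col_stats, val):
    # if even the smallest value in the stripe is >= val, no row satisfies col < val
    col_min = col_stats.get("minimum")
    if col_min is not None and val <= col_min:
        return False  # prune the stripe
    return True       # statistics alone cannot rule the stripe out

print(stripe_may_contain_lt({"minimum": 10, "maximum": 50}, 5))   # False -> skip
print(stripe_may_contain_lt({"minimum": 10, "maximum": 50}, 20))  # True  -> read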
def assemble_initial_source_list(catalog_vnum): """ Given LIST_OF_LISTS_STARTER_v0.5.csv , exported from /doc/list_of_cluster_member_lists.ods, clean and concatenate the cluster members. Flatten the resulting list on source_ids, joining the cluster, age, and bibcode columns into comma-separated strings. """ metadf = pd.read_csv( os.path.join(clusterdatadir, 'LIST_OF_LISTS_STARTER_V0.6.csv') ) metadf['bibcode'] = metadf.ads_link.str.extract("abs\/(.*)\/") N_stars_in_lists = [] Nstars_with_age_in_lists = [] dfs = [] # for each table, concatenate into a dataframe of source_id, cluster, # log10age ("age"). for ix, r in metadf.iterrows(): print(79*'-') print(f'Beginning {r.reference_id}...') csvpath = os.path.join(clusterdatadir, r.csv_path) assert os.path.exists(csvpath) df = pd.read_csv(csvpath) df['reference_id'] = r.reference_id df['reference_bibcode'] = r.bibcode if 'HATSandHATNcandidates' in r.reference_id: df['reference_bibcode'] = 'JoelHartmanPrivComm' colnames = df.columns # # every CSV file needs a Gaia DR2 "source_id" column # if "source" in colnames: df = df.rename( columns={"source":"source_id"} ) # # every CSV file needs a "cluster name" name column # if "assoc" in colnames: df = df.rename( columns={"assoc":"cluster"} # moving groups ) colnames = df.columns if "cluster" not in colnames: msg = ( f'WRN! for {r.reference_id} did not find "cluster" column. '+ f'Appending the reference_id ({r.reference_id}) as the cluster ID.' ) print(msg) df['cluster'] = r.reference_id # # every CSV file needs an "age" column, which can be null, but # preferably is populated. # if "age" not in colnames: if r.reference_id in [ 'CantatGaudin2018a', 'CantatGaudin2020a', 'CastroGinard2020', 'GaiaCollaboration2018lt250', 'GaiaCollaboration2018gt250' ]: # get clusters and ages from CG20b; use them as the reference cg20bpath = os.path.join( clusterdatadir, "v05/CantatGaudin20b_cut_cluster_source_age.csv" ) df_cg20b = pd.read_csv(cg20bpath) cdf_cg20b = df_cg20b.drop_duplicates(subset=['cluster','age'])[ ['cluster', 'age'] ] # cleaning steps if r.reference_id == 'CastroGinard2020': df['cluster'] = df.cluster.str.replace('UBC', 'UBC_') elif r.reference_id in [ 'GaiaCollaboration2018lt250', 'GaiaCollaboration2018gt250' ]: df['cluster'] = df.cluster.str.replace('NGC0', 'NGC_') df['cluster'] = df.cluster.str.replace('NGC', 'NGC_') df['cluster'] = df.cluster.str.replace('IC', 'IC_') df['cluster'] = df.cluster.str.replace('Stock', 'Stock_') df['cluster'] = df.cluster.str.replace('Coll', 'Collinder_') df['cluster'] = df.cluster.str.replace('Trump02', 'Trumpler_2') df['cluster'] = df.cluster.str.replace('Trump', 'Trumpler_') _df = df.merge(cdf_cg20b, how='left', on=['cluster']) assert len(_df) == len(df) df['age'] = _df['age'] print( f'For {r.reference_id} got {len(df[~pd.isnull(df.age)])}/{len(df)} finite ages via CantatGaudin2020b crossmatch on cluster ID.' 
) del _df elif ( ('Zari2018' in r.reference_id) or ('Oh2017' in r.reference_id) or ('Ujjwal2020' in r.reference_id) or ('CottenSong' in r.reference_id) or ('HATSandHATNcandidates' in r.reference_id) or ('SIMBAD' in r.reference_id) or ('Gagne2018' in r.reference_id) ): age = np.ones(len(df))*np.nan df['age'] = age else: age_mapper = lambda k: AGE_LOOKUP[k] age = df.cluster.apply(age_mapper) df['age'] = age N_stars_in_lists.append(len(df)) Nstars_with_age_in_lists.append(len(df[~pd.isnull(df.age)])) dfs.append(df) assert ( 'source_id' in df.columns and 'cluster' in df.columns and 'age' in df.columns ) metadf["Nstars"] = N_stars_in_lists metadf["Nstars_with_age"] = Nstars_with_age_in_lists # concatenation. nomagcut_df = pd.concat(dfs) assert np.sum(metadf.Nstars) == len(nomagcut_df) # clean ages sel = (nomagcut_df.age == -np.inf) nomagcut_df.loc[sel,'age'] = np.nan nomagcut_df['age'] = np.round(nomagcut_df.age,2) # # merge duplicates, and ','-join the cluster id strings, age values # scols = ['source_id', 'cluster', 'age', 'reference_id', 'reference_bibcode'] nomagcut_df = nomagcut_df[scols].sort_values(by='source_id') for c in nomagcut_df.columns: nomagcut_df[c] = nomagcut_df[c].astype(str) print(79*'-') print('Beginning aggregation (takes ~2-3 minutes for v0.5)...') _ = nomagcut_df.groupby('source_id') df_agg = _.agg({ "cluster": list, "age": list, "reference_id": list, "reference_bibcode": list }) u_sourceids = np.unique(nomagcut_df.source_id) N_sourceids = len(u_sourceids) assert len(df_agg) == N_sourceids df_agg["source_id"] = df_agg.index # turn the lists to comma separated strings. outdf = pd.DataFrame({ "source_id": df_agg.source_id, "cluster": [','.join(map(str, l)) for l in df_agg['cluster']], "age": [','.join(map(str, l)) for l in df_agg['age']], "mean_age": [np.round(np.nanmean(np.array(l).astype(float)),2) for l in df_agg['age']], "reference_id": [','.join(map(str, l)) for l in df_agg['reference_id']], "reference_bibcode": [','.join(map(str, l)) for l in df_agg['reference_bibcode']], }) outpath = os.path.join( clusterdatadir, f'list_of_lists_keys_paths_assembled_v{catalog_vnum}.csv' ) metadf.to_csv(outpath, index=False) print(f'Made {outpath}') outpath = os.path.join( clusterdatadir, f'cdips_targets_v{catalog_vnum}_nomagcut.csv' ) outdf.to_csv(outpath, index=False) print(f'Made {outpath}')
def get_sample_text(condition: str, sample_type: str, cell_type: str = None, replicate: str = None, lane: str = None): """ Construct the text for a sample name by concatenating the respective properties of the sample. The format of the name is: [cell-type, ] <condition>, <sample_type> [(<replicate>)] [(lane: <lane>)] The optional parts are skipped if the respective value is None. Parameters ---------- condition: string The name of the condition for the sample, e.g., "sham.cm" sample_type: string The type of the sample, e.g., "riboseq" cell_type: string The type of cell (tissue, etc.) from which the sample came, e.g., "cm" replicate: string An identifier for the (biological) replicate, e.g., "mouse-403" lane: string An identifier for the lane of the sample, e.g., "2" Returns ------- sample_name_text: string, or None The name, constructed as indicated above. If condition is None, NaN, or a zero-length string, then None is returned. """ if pd.isnull(condition): return None if condition is None: return None if len(condition) == 0: return None sample_name = "" if cell_type is not None: sample_name = "{}{}, ".format(sample_name, cell_type) sample_name = "{}{}, {} ".format(sample_name, str(condition), str(sample_type)) if replicate is not None: sample_name = "{}({}) ".format(sample_name, str(replicate)) if lane is not None: sample_name = "{}(lane: {})".format(sample_name, str(lane)) sample_name = sample_name.strip() return sample_name
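A usage sketch for the function above (toy values; assumes the function and its pandas import are in scope):

print(get_sample_text("sham.cm", "riboseq", cell_type="cm",
                      replicate="mouse-403", lane="2"))
# -> "cm, sham.cm, riboseq (mouse-403) (lane: 2)"
print(get_sample_text(None, "riboseq"))
# -> None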
def get_target_catalog(catalog_vnum, VERIFY=1): """ 1. Assemble the target catalog (down to arbitrary brightness; i.e, just clean and concatenate). 2. Manually async query the Gaia database based on those source_ids. 3. Verify the result, and merge and write it. """ csvpath = os.path.join( clusterdatadir, f'cdips_targets_v{catalog_vnum}_nomagcut.csv' ) if not os.path.exists(csvpath): assemble_initial_source_list(catalog_vnum) df = pd.read_csv(csvpath) # made by assemble_initial_source_list above. metapath = os.path.join( clusterdatadir, f'list_of_lists_keys_paths_assembled_v{catalog_vnum}.csv' ) metadf = pd.read_csv(metapath) if VERIFY: # one-time verification verify_target_catalog(df, metadf) # e.g., cdips_v05_1-result.vot.gz votablepath = os.path.join( clusterdatadir, f'cdips_v{str(catalog_vnum).replace(".","")}_1-result.vot.gz' ) if not os.path.exists(votablepath): temppath = os.path.join(clusterdatadir, f'v{str(catalog_vnum).replace(".","")}_sourceids.csv') print(f'Wrote {temppath}') df['source_id'].to_csv( temppath, index=False ) querystr = ( "SELECT top 2000000 g.source_id, g.ra, g.dec, g.parallax, "+ "g.parallax_error, g.pmra, g.pmdec, g.phot_g_mean_mag, "+ "g.phot_rp_mean_mag, g.phot_bp_mean_mag FROM "+ f"user_lbouma.v{str(catalog_vnum).replace('.','')}_sourceids as u, gaiadr2.gaia_source AS g WHERE "+ "u.source_id=g.source_id " ) print('Now you must go to https://gea.esac.esa.int/archive/, login, and run') print(querystr) assert 0 # # NOTE: the naive implementation below doesn't work, probably because of a # # sync/async issue. given_source_ids_get_gaia_data now raises an # # error # if n_max exceeds 5e4, because the ~70k items that WERE # # returned are duds. # cols = ( # 'g.source_id, g.ra, g.dec, g.parallax, g.parallax_error, g.pmra, ' # 'g.pmdec, g.phot_g_mean_mag, g.phot_rp_mean_mag, g.phot_bp_mean_mag' # ) # gdf = given_source_ids_get_gaia_data( # np.array(df.source_id.astype(np.int64)), # f'cdips_targets_v{catalog_vnum}', # n_max=int(2e6), overwrite=False, # enforce_all_sourceids_viable=True, whichcolumns=cols, # gaia_datarelease='gaiadr2' # ) gdf = given_votable_get_df(votablepath, assert_equal='source_id') if not len(gdf) == len(df): print(79*"*") print('WRN!') print(f'Expected {len(df)} matches in Gaia DR2') print(f'Got {len(gdf)} matches in Gaia DR2') print(79*"*") verify_gaia_xmatch(df, gdf, metadf) # every queried source_id should have a result. the two that do not are # EsplinLuhman2019, 377 matches to 443 stars, and Gagne2018c, 914 matches # to 916 stars. this is 60 missing stars out of 1.5 million. we'll be okay. # so, do the merge using the GAIA xmatch results as the base. mdf = gdf.merge(df, on='source_id', how='left') # # update metadf with new info. 
# N_stars_in_lists = [] Nstars_with_age_in_lists = [] N_sel0 = [] N_sel1 = [] N_sel2 = [] for ix, r in metadf.iterrows(): csvpath = os.path.join(clusterdatadir, r.csv_path) assert os.path.exists(csvpath) _df = pd.read_csv(csvpath) if 'source_id' not in _df.columns: _df = _df.rename(columns={"source":"source_id"}) _sel = mdf.source_id.isin(_df.source_id) N_stars_in_lists.append(len(mdf[_sel])) _selage = (~pd.isnull(mdf.age)) & _sel Nstars_with_age_in_lists.append(len(mdf[_selage])) _sel0 = ( _sel & (mdf.phot_rp_mean_mag < 16) ) _sel1 = ( _sel & ( (mdf.phot_rp_mean_mag < 16) | ( (mdf.parallax/mdf.parallax_error > 5) & (mdf.parallax > 10) ) ) ) _sel2 = _sel1 & (mdf.mean_age > -1) N_sel0.append(len(mdf[_sel0])) N_sel1.append(len(mdf[_sel1])) N_sel2.append(len(mdf[_sel2])) metadf["N_gaia"] = N_stars_in_lists metadf["N_gaia_withage"] = Nstars_with_age_in_lists metadf["N_Rplt16"] = N_sel0 metadf["N_Rplt16_orclose"] = N_sel1 metadf["N_Rplt16_orclose_withage"] = N_sel2 metadf['Nstars_m_Ngaia'] = metadf.Nstars - metadf.N_gaia # # save the output # csvpath = os.path.join( clusterdatadir, f'cdips_targets_v{catalog_vnum}_nomagcut_gaiasources.csv' ) if not os.path.exists(csvpath): mdf.to_csv(csvpath, index=False) print(f'Wrote {csvpath}') else: print(f'Found {csvpath}') metapath = os.path.join( clusterdatadir, f'list_of_lists_keys_paths_assembled_v{catalog_vnum}_gaiasources.csv' ) if not os.path.exists(metapath): metadf.sort_values(by='Nstars', ascending=False).to_csv(metapath, index=False) print(f'Wrote {metapath}') else: print(f'Found {metapath}') # Rp<16 csvpath = os.path.join( clusterdatadir, f'cdips_targets_v{catalog_vnum}_gaiasources_Rplt16.csv' ) if not os.path.exists(csvpath): sel = (mdf.phot_rp_mean_mag < 16) smdf = mdf[sel] smdf.to_csv(csvpath, index=False) print(f'Wrote {csvpath}') else: print(f'Found {csvpath}') # Rp<16 or close csvpath = os.path.join( clusterdatadir, f'cdips_targets_v{catalog_vnum}_gaiasources_Rplt16_orclose.csv' ) if not os.path.exists(csvpath): sel = ( (mdf.phot_rp_mean_mag < 16) | ( (mdf.parallax/mdf.parallax_error > 5) & (mdf.parallax > 10) ) ) smdf = mdf[sel] smdf.to_csv(csvpath, index=False) print(f'Wrote {csvpath}') else: print(f'Found {csvpath}')
import matplotlib.pyplot as plt csv_lines = [] #preallocate list for extracted csv lines path_to_csv = "data/dataset.csv" raw_data = pd.read_csv(path_to_csv, sep=';') # encode as nominal #raw_data.user.unique(); raw_data.user = raw_data.user.map({'debora':0, 'katia':1, 'wallace':2, 'jose_carlos':3}); raw_data.user.unique() raw_data.gender.unique(); raw_data.gender = raw_data.gender.map({'Woman':1, 'Man':0}); raw_data.gender.unique() raw_data['how_tall_in_meters'] = raw_data['how_tall_in_meters'].str.replace(',', '.') raw_data['body_mass_index'] = raw_data['body_mass_index'].str.replace(',', '.') raw_data[pd.isnull(raw_data).any(axis=1)] raw_data.isnull().values.any(); raw_data.isnull().sum().sum() raw_data.drop(raw_data[raw_data.z4 == "-14420-11-2011 04:50:23,713"].index.values, inplace=True) # row 122076 -> (165633, 19) raw_data.z4 = pd.to_numeric(raw_data.z4, errors='raise'); raw_data.dtypes # z4 object -> int64 raw_data.columns raw_data[pd.isnull(raw_data).any(axis=1)] raw_data = raw_data[raw_data['body_mass_index'].notnull()] raw_data[pd.isnull(raw_data).any(axis=1)] raw_data.isnull().values.any(); raw_data.isnull().sum().sum()
def infer_schema(_data, fname, output_root='', sample_size=1.0, type_threshold=0.5,
                 n_jobs=1, base_schema=None, base_schema_feature_colname='column',
                 base_schema_dtype_colname='type'):
    """
    Infer data types for all columns for the input table

    Parameters
    ----------
    _data: pandas DataFrame
        data table to infer
    fname: string
        the output file name
    output_root: string, default=''
        the root directory for the output file
    sample_size: int or float(<= 1.0), default=1.0
        int: number of sample rows to infer the data type (useful for large tables)
        float: sample size in percentage
    type_threshold: float(<= 1.0), default=0.5
        threshold for inferring data type
    n_jobs: int, default=1
        the number of jobs to run in parallel
    base_schema: pandas DataFrame, default=None
        data schema to base on
    base_schema_feature_colname: string
        feature_colname in base schema
    base_schema_dtype_colname: string
        dtype_colname in base schema
    """

    # copy raw data table
    data = _data.copy()

    # open a new workbook to store all results
    wb = openpyxl.Workbook()
    ws = wb['Sheet']
    ws.title = 'schema'

    # calculate sample size
    if sample_size <= 1.0:
        sample_size = int(data.shape[0] * sample_size)

    # dictionary to store dropna sample data values
    data_dropna_sample_values = {}
    for col in data.columns.values:
        if len(data[col].dropna()) <= sample_size:
            data_dropna_sample_values[col] = data[col].dropna().values
        else:
            data = data.sample(sample_size).reset_index(drop=True)
            data_dropna_sample_values[col] = data[col].dropna().values

    # use data_dropna_sample_values to infer data type for each column
    _n_jobs = np.min([n_jobs, len(data.columns.values)])
    type_infos = Parallel(n_jobs=_n_jobs)(
        delayed(_infer_dtype)(data_dropna_sample_values[col], col, type_threshold)
        for col in data.columns.values)
    type_infos_df = pd.DataFrame(type_infos)[['column', 'type']]

    # dtype mapping for basic stat calculation
    data_types = {}
    for col in data.columns.values:
        data_types[col] = type_infos_df.loc[type_infos_df['column'] == col, 'type'].values[0]

    # get basic statistic information for all columns
    stat_infos = Parallel(n_jobs=_n_jobs)(
        delayed(_cal_column_stat)(data_dropna_sample_values[col], col, data_types[col])
        for col in data.columns.values)
    stat_infos_df = pd.DataFrame(stat_infos)

    # merge dtype information with stat information
    full_infos_df = type_infos_df.merge(stat_infos_df, on='column', how='left')
    full_infos_df = full_infos_df[[
        'column', 'type', 'sample_value', 'sample_num_uni', 'sample_min',
        'sample_median', 'sample_max', 'sample_std'
    ]]

    # if base_schema is provided, we can compare with the base schema
    if base_schema is not None:
        base_schema = base_schema.rename(columns={
            base_schema_feature_colname: 'base_column',
            base_schema_dtype_colname: 'base_type'
        })[['base_column', 'base_type']]
        full_infos_df = full_infos_df.merge(base_schema, left_on='column',
                                            right_on='base_column', how='outer')

        # compare with the base schema
        full_infos_df['base_column'] = full_infos_df['base_column'].apply(
            lambda x: 'column not in base table' if pd.isnull(x) else x)
        full_infos_df['column'] = full_infos_df['column'].apply(
            lambda x: 'column not in current table' if pd.isnull(x) else x)

        # reorder the columns
        full_infos_df = full_infos_df[[
            'column', 'base_column', 'type', 'base_type', 'sample_value',
            'sample_num_uni', 'sample_min', 'sample_median', 'sample_max', 'sample_std'
        ]]

    # add data validation for the type column
    val_type = DataValidation(type="list", formula1='"key,numeric,str,date"',
                              allow_blank=False)
    ws.add_data_validation(val_type)

    # get col_name, excel column mapping
    column_mapping = {}
    for i, col in enumerate(full_infos_df.columns):
        column_mapping[col] = xlsxwriter.utility.xl_col_to_name(i)

    # write everything into the worksheet
    for r_idx, r in enumerate(dataframe_to_rows(full_infos_df, index=False, header=True)):
        ws.append(r)
        for cell_idx, cell in enumerate(
                ws.iter_cols(max_col=ws.max_column, min_row=ws.max_row, max_row=ws.max_row)):
            cell = cell[0]
            if r_idx != 0:
                val_type.add(ws['%s%d' % (column_mapping['type'], ws.max_row)])
                if cell_idx == 0:
                    cell.font = Font(bold=True)
            else:
                cell.style = 'Accent5'

    # add conditional formatting
    red_fill = PatternFill(bgColor="FFC7CE")
    red_font = Font(color="9C0006")
    green_fill = PatternFill(bgColor="C6EFCE")
    green_font = Font(color="006100")
    blue_fill = PatternFill(bgColor="9ECAE1")
    blue_font = Font(color="08306B")
    orange_fill = PatternFill(bgColor="FDD0A2")
    orange_font = Font(color="A63603")
    purple_fill = PatternFill(bgColor="DADAEB")
    purple_font = Font(color="3F007D")

    # red highlight if there is any inconsistency between the base and the target
    if base_schema is not None:
        col1 = column_mapping['column']
        col2 = column_mapping['base_column']
        ws.conditional_formatting.add(
            '%s2:%s%d' % (col1, col1, ws.max_row),
            FormulaRule(formula=['%s2<>%s2' % (col1, col2)], stopIfTrue=True,
                        fill=red_fill, font=red_font))
        ws.conditional_formatting.add(
            '%s2:%s%d' % (col2, col2, ws.max_row),
            FormulaRule(formula=['%s2<>%s2' % (col1, col2)], stopIfTrue=True,
                        fill=red_fill, font=red_font))

        col1 = column_mapping['type']
        col2 = column_mapping['base_type']
        ws.conditional_formatting.add(
            '%s2:%s%d' % (col1, col1, ws.max_row),
            FormulaRule(formula=['%s2<>%s2' % (col1, col2)], stopIfTrue=True,
                        fill=red_fill, font=red_font))
        ws.conditional_formatting.add(
            '%s2:%s%d' % (col2, col2, ws.max_row),
            FormulaRule(formula=['%s2<>%s2' % (col1, col2)], stopIfTrue=True,
                        fill=red_fill, font=red_font))

    # yellow highlight for the type column (which needs to be modified)
    ws['%s1' % (column_mapping['type'])].style = 'Neutral'

    # green highlight for the key type and red highlight for the error type
    type_cols = [column_mapping['type']]
    if 'base_type' in column_mapping.keys():
        type_cols.append(column_mapping['base_type'])
    for col in type_cols:
        ws.conditional_formatting.add(
            '%s2:%s%d' % (col, col, ws.max_row),
            FormulaRule(formula=['%s2="error"' % (col)], stopIfTrue=True,
                        fill=red_fill, font=red_font))
        ws.conditional_formatting.add(
            '%s2:%s%d' % (col, col, ws.max_row),
            FormulaRule(formula=['%s2="key"' % (col)], stopIfTrue=True,
                        fill=green_fill, font=green_font))
        ws.conditional_formatting.add(
            '%s2:%s%d' % (col, col, ws.max_row),
            FormulaRule(formula=['%s2="numeric"' % (col)], stopIfTrue=True,
                        fill=blue_fill, font=blue_font))
        ws.conditional_formatting.add(
            '%s2:%s%d' % (col, col, ws.max_row),
            FormulaRule(formula=['%s2="str"' % (col)], stopIfTrue=True,
                        fill=orange_fill, font=orange_font))
        ws.conditional_formatting.add(
            '%s2:%s%d' % (col, col, ws.max_row),
            FormulaRule(formula=['%s2="date"' % (col)], stopIfTrue=True,
                        fill=purple_fill, font=purple_font))

    # red highlight for sample_num_uni = 0 or 1, i.e. only one unique value
    ws.conditional_formatting.add(
        '%s2:%s%d' % (column_mapping['sample_num_uni'],
                      column_mapping['sample_num_uni'], ws.max_row),
        FormulaRule(formula=['%s2=0' % (column_mapping['sample_num_uni'])],
                    stopIfTrue=True, fill=red_fill, font=red_font))
    ws.conditional_formatting.add(
        '%s2:%s%d' % (column_mapping['sample_num_uni'],
                      column_mapping['sample_num_uni'], ws.max_row),
        FormulaRule(formula=['%s2=1' % (column_mapping['sample_num_uni'])],
                    stopIfTrue=True, fill=red_fill, font=red_font))

    # adjust the column format for the worksheet
    _adjust_ws(ws=ws, row_height=20)

    wb.save(filename=os.path.join(output_root, 'data_schema_%s.xlsx' % (fname)))
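# A minimal usage sketch for infer_schema, illustrative only: it assumes the
# surrounding module already provides _infer_dtype, _cal_column_stat and
# _adjust_ws, and that the pandas/numpy/openpyxl/joblib imports used above are
# in scope. The toy DataFrame and output name are made up for the example.
if __name__ == '__main__':
    toy = pd.DataFrame({
        'id': [1, 2, 3, 4],
        'price': [9.99, 12.50, None, 7.25],
        'label': ['a', 'b', 'b', None],
    })
    # writes data_schema_toy.xlsx into the current directory
    infer_schema(toy, fname='toy', output_root='', sample_size=1.0, n_jobs=1)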
dataset["Cabin"].describe()

# In[ ]:

dataset["Cabin"].isnull().sum()

# In[ ]:

dataset["Cabin"][dataset["Cabin"].notnull()].head()

# In[ ]:

# Replace the Cabin number by the type of cabin, 'X' if there is no cabin
dataset["Cabin"] = pd.Series(
    [i[0] if not pd.isnull(i) else 'X' for i in dataset['Cabin']])

# The first letter of the cabin indicates the deck. I chose to keep only this
# information, since it indicates the probable location of the passenger in the Titanic.

# In[ ]:

# Show the counts of observations in each categorical bin using bars.
g = sns.countplot(dataset["Cabin"],
                  order=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'X'])

# In[ ]:

g = sns.catplot(y="Survived", x="Cabin", data=dataset, kind="bar")
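# A tiny standalone illustration of the deck mapping above, using toy cabin
# values: each cabin string keeps only its first letter, and missing cabins
# become 'X'.
example = pd.Series(['C85', None, 'E46'])
print([i[0] if not pd.isnull(i) else 'X' for i in example])  # ['C', 'X', 'E']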
def amortize(self, df):
    """
    Input a day-wise sparse dataframe. Return an amortized dataframe.

    Parameters
    ----------
    df : dataframe
        A sparse dataframe with date as its index, e.g.

        DATE        Brent Oil Futures Historical Data - Price
        2010-01-01  NaN
        2010-01-02  NaN
        2010-01-03  NaN
        2010-01-04  80.12
        2010-01-05  80.59

    Raises
    ------
    ValueError
        Raised when the amortization contains NaN.

    Returns
    -------
    df : dataframe
        A dataframe with no NaN and date as its index, e.g.

        DATE        Brent Oil Futures Historical Data - Price
        2010-01-01  80.12
        2010-01-02  80.12
        2010-01-03  80.12
        2010-01-04  80.12
        2010-01-05  80.59
    """
    display, verbose = True, True
    if display:
        feature_ctr, unab_amort_list = 0, []

    df = df.copy()
    for col in df.columns:
        # if verbose:
        #     print(col)
        index = np.where(df[col].notnull())[0]
        if index.size >= 2:
            amortization = [df[col].iloc[index[0]]] * (index[0] - 0)
            for i in range(len(index) - 1):
                amortization.extend(
                    np.linspace(float(df[col].iloc[index[i]]),
                                float(df[col].iloc[index[i + 1]]),
                                index[i + 1] - index[i],
                                endpoint=False))
                if np.any(pd.isnull(amortization)):
                    print(i)
                    raise ValueError(f'{col} contains NaN')
            amortization.extend([df[col].iloc[index[i + 1]]] *
                                (len(df[col]) - 1 - index[i + 1] + 1))
            df[col] = amortization
            # Make sure all values are converted into numbers
            df[col] = df[col].astype(float)
            if np.any(pd.isnull(df[col])):
                print('null', col)
                raise ValueError
            if display:
                feature_ctr += 1
        elif index.size < 2:
            if display:
                unab_amort_list.append(col)
            if verbose:
                print(f'Unable to amortize {col}')
            df.drop(columns=col, inplace=True)
    return df
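# A minimal check of the behaviour described in the docstring, illustrative
# only: `self` is never used by amortize, so None is passed in its place here.
dates = pd.date_range('2010-01-01', periods=5, freq='D')
sparse = pd.DataFrame(
    {'Brent Oil Futures Historical Data - Price':
     [np.nan, np.nan, np.nan, 80.12, 80.59]},
    index=dates)
filled = amortize(None, sparse)
print(filled)  # leading NaNs are replaced by 80.12, matching the docstring example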
def UserTimeLine_JsonLoad(input_str, index_line): ''' ''' # main data structure, contains information from each tweet Tweet_OBJ = col.defaultdict() # tweet Tweet_OBJ['tweet_time'] = None # pd.timestamp Tweet_OBJ['tweet_id'] = None # all id are id_str Tweet_OBJ['text'] = "" # make sure text is onto a single line..... Tweet_OBJ['lang'] = "" Tweet_OBJ[ 'coordinates'] = "" # Null, or {"coordinates":[-75.14310264,40.05701649],"type":"Point"} Tweet_OBJ['reply_to_userID'] = "-1" # set of tuple(userID, userName) Tweet_OBJ['quoted_status_id'] = "-1" Tweet_OBJ[ 'in_reply_to_status_id'] = "-1" # set of tweetID (set of strings) Tweet_OBJ['retweet_count'] = 0 Tweet_OBJ['favorite_count'] = 0 # user Tweet_OBJ['user_id'] = "-1" # all id are id_str Tweet_OBJ['user_name'] = "" Tweet_OBJ['user_verified'] = False # default Tweet_OBJ['user_followers'] = 0 # default 0 Tweet_OBJ['user_friends'] = 0 Tweet_OBJ['user_favourites'] = 0 Tweet_OBJ['user_listed'] = 0 Tweet_OBJ['user_statuses'] = 0 # hash tag Tweet_OBJ['Tag'] = set() # set of strings # mentions Tweet_OBJ['mentioned_userID'] = set() # set of tuple(userID, userName) ################################################################################# # json load, extract tweet time and id_str flag_TidTimeAuthor = True # flag for tweet id, time and author try: # load json tweet_json = json.loads(input_str) except ValueError: print "Line: {}, json loads Error".format(index_line) flag_TidTimeAuthor = False else: # extract date-time from mainbody try: time_str = tweet_json['created_at'] tweet_id = tweet_json['id_str'] except ValueError: flag_TidTimeAuthor = False pass except KeyError: flag_TidTimeAuthor = False pass else: # convert to pandas timestamp try: time_dt = pd.to_datetime(time_str) if pd.isnull(time_dt): flag_TidTimeAuthor = False print "Line: {}, date-time is NaT".format(index_line) except ValueError: flag_TidTimeAuthor = False print "Line: {}, date-time convertion failed".format( index_line) pass else: # upload to RetD_TimeUserTag if flag_TidTimeAuthor: Tweet_OBJ['tweet_time'] = time_dt Tweet_OBJ['tweet_id'] = tweet_id ################################################################################# # extract user information sub-json if flag_TidTimeAuthor: try: user_json = tweet_json['user'] except ValueError: flag_TidTimeAuthor = False pass except KeyError: flag_TidTimeAuthor = False pass else: # extract user statistics try: user_id = user_json['id_str'] user_name = user_json['screen_name'] if len(user_name) > 253: user_name = user_name[:250] user_followers = user_json['followers_count'] user_friends = user_json['friends_count'] except ValueError: flag_TidTimeAuthor = False pass except KeyError: flag_TidTimeAuthor = False pass else: if flag_TidTimeAuthor: Tweet_OBJ['user_id'] = user_id Tweet_OBJ['user_name'] = user_name Tweet_OBJ['user_followers'] = user_followers Tweet_OBJ['user_friends'] = user_friends ################################################################################# # extract tweet direct information if flag_TidTimeAuthor: # extract coordinates information try: geo_json = tweet_json['coordinates'] coordinates = str(geo_json['coordinates']) except ValueError: pass except KeyError: pass except AttributeError: pass except TypeError: pass else: Tweet_OBJ['coordinates'] = coordinates # extract lang information try: lang = tweet_json['lang'] except ValueError: pass except KeyError: pass except AttributeError: pass except TypeError: pass else: Tweet_OBJ['lang'] = lang # extract retweet_count information try: retweet_count = 
tweet_json['retweet_count'] except ValueError: pass except KeyError: pass except AttributeError: pass except TypeError: pass else: Tweet_OBJ['retweet_count'] = retweet_count # extract favorite_count information try: favorite_count = tweet_json['favorite_count'] except ValueError: pass except KeyError: pass except AttributeError: pass except TypeError: pass else: Tweet_OBJ['favorite_count'] = favorite_count # extract reply_to_user information try: reply_userID_str = tweet_json['in_reply_to_user_id_str'] # if userID == null, raise error; if not full digit str, raise false flag_idstr = reply_userID_str.isdigit() except ValueError: pass except KeyError: pass except AttributeError: pass except TypeError: pass else: if flag_idstr == True: Tweet_OBJ['reply_to_userID'] = reply_userID_str # extract in_reply_to_status_id information try: reply_tweetID_str = tweet_json['in_reply_to_status_id_str'] # if userID == null, raise error; if not full digit str, raise false flag_idstr = reply_tweetID_str.isdigit() except ValueError: pass except KeyError: pass except AttributeError: pass except TypeError: pass else: if flag_idstr == True: Tweet_OBJ['in_reply_to_status_id'] = reply_tweetID_str # extract quoted_status_id information try: quoted_status_id = tweet_json['quoted_status_id'] # if userID == null, raise error; if not full digit str, raise false flag_idstr = quoted_status_id.isdigit() except ValueError: pass except KeyError: pass except AttributeError: pass except TypeError: pass else: if flag_idstr == True: Tweet_OBJ['quoted_status_id'] = quoted_status_id ################################################################################# # extract tags from entities if flag_TidTimeAuthor: # extract tags from entities tag_list = set([]) # eliminate repeating tags try: entities_json = tweet_json['entities'] Hashtags_json = entities_json['hashtags'] except ValueError: pass except KeyError: pass except TypeError: pass else: for entry in Hashtags_json: try: # THIS IS VERY VERY VERY IMPORTANT !!!!! tag_text = str(entry['text']).lower() if len(tag_text) > 253: tag_text = tag_text[:250] tag_list.add( tag_text) # THIS IS VERY VERY VERY IMPORTANT !!!!! # THIS IS VERY VERY VERY IMPORTANT !!!!! # MySQL cant distinguish upper and lower cases when str is used as name for table # which will result in confusion in data analysis except ValueError: pass except KeyError: pass except TypeError: pass # end of for for item in tag_list: Tweet_OBJ['Tag'].add(item) ############################################################# # extract text if flag_TidTimeAuthor: # extract date-time from mainbody try: text_str = tweet_json['text'] text_str = transASC(text_str) text_str = removeUtf(text_str) text_str = text_str.replace("'", "") text_str = parse_MultiLine_text(text_str) except ValueError: pass except KeyError: pass else: Tweet_OBJ['text'] = text_str ############################################################# # extract mentioned_userID if flag_TidTimeAuthor: # extract entities and user_mentions try: usermentions_json = entities_json['user_mentions'] except ValueError: pass except KeyError: pass except TypeError: pass else: for entry in usermentions_json: try: Tweet_OBJ['mentioned_userID'].add(entry['id_str']) except ValueError: pass except KeyError: pass except TypeError: pass ############################################################# return flag_TidTimeAuthor, Tweet_OBJ
def save_graph_xml( data, filepath=None, node_tags=settings.osm_xml_node_tags, node_attrs=settings.osm_xml_node_attrs, edge_tags=settings.osm_xml_way_tags, edge_attrs=settings.osm_xml_way_attrs, oneway=False, merge_edges=True, edge_tag_aggs=None, ): """ Save graph to disk as an OSM-formatted XML .osm file. Note: for large networks this function can take a long time to run. Before using this function, make sure you configured OSMnx as described in the example below when you created the graph. Example ------- >>> import osmnx as ox >>> utn = ox.settings.useful_tags_node >>> oxna = ox.settings.osm_xml_node_attrs >>> oxnt = ox.settings.osm_xml_node_tags >>> utw = ox.settings.useful_tags_way >>> oxwa = ox.settings.osm_xml_way_attrs >>> oxwt = ox.settings.osm_xml_way_tags >>> utn = list(set(utn + oxna + oxnt)) >>> utw = list(set(utw + oxwa + oxwt)) >>> ox.config(all_oneway=True, useful_tags_node=utn, useful_tags_way=utw) >>> G = ox.graph_from_place('Piedmont, CA, USA', network_type='drive') >>> ox.save_graph_xml(G, filepath='./data/graph1.osm') Parameters ---------- data : networkx multi(di)graph OR a length 2 iterable of nodes/edges geopandas GeoDataFrames filepath : string path to the .osm file including extension. if None, use default data folder + graph.osm node_tags : list osm node tags to include in output OSM XML node_attrs: list osm node attributes to include in output OSM XML edge_tags : list osm way tags to include in output OSM XML edge_attrs : list osm way attributes to include in output OSM XML oneway : bool the default oneway value used to fill this tag where missing merge_edges : bool if True merges graph edges such that each OSM way has one entry and one entry only in the OSM XML. Otherwise, every OSM way will have a separate entry for each node pair it contains. edge_tag_aggs : list of length-2 string tuples useful only if merge_edges is True, this argument allows the user to specify edge attributes to aggregate such that the merged OSM way entry tags accurately represent the sum total of their component edge attributes. For example, if the user wants the OSM way to have a "length" attribute, the user must specify `edge_tag_aggs=[('length', 'sum')]` in order to tell this method to aggregate the lengths of the individual component edges. Otherwise, the length attribute will simply reflect the length of the first edge associated with the way. 
Returns ------- None """ # default filepath if none was provided if filepath is None: filepath = os.path.join(settings.data_folder, "graph.osm") # if save folder does not already exist, create it folder, filename = os.path.split(filepath) if not folder == "" and not os.path.exists(folder): os.makedirs(folder) if not settings.all_oneway: raise UserWarning("In order for save_graph_osm to behave properly " "the graph must have been created with the " "`all_oneway` setting set to True.") try: gdf_nodes, gdf_edges = data except ValueError: gdf_nodes, gdf_edges = utils_graph.graph_to_gdfs( data, node_geometry=False, fill_edge_geometry=False) # rename columns per osm specification gdf_nodes.rename(columns={ "osmid": "id", "x": "lon", "y": "lat" }, inplace=True) if "id" in gdf_edges.columns: gdf_edges = gdf_edges[[col for col in gdf_edges if col != "id"]] if "uniqueid" in gdf_edges.columns: gdf_edges = gdf_edges.rename(columns={"uniqueid": "id"}) else: gdf_edges = gdf_edges.reset_index().rename(columns={"index": "id"}) # add default values for required attributes for table in (gdf_nodes, gdf_edges): table["uid"] = "1" table["user"] = "******" table["version"] = "1" table["changeset"] = "1" table["timestamp"] = "2017-01-01T00:00:00Z" # convert all datatypes to str gdf_nodes = gdf_nodes.applymap(str) gdf_edges = gdf_edges.applymap(str) # misc. string replacements to meet OSM XML spec if "oneway" in gdf_edges.columns: # fill blank oneway tags with default (False) gdf_edges.loc[pd.isnull(gdf_edges["oneway"]), "oneway"] = oneway gdf_edges.loc[:, "oneway"] = gdf_edges["oneway"].astype(str) gdf_edges.loc[:, "oneway"] = (gdf_edges["oneway"].str.replace( "False", "no").replace("True", "yes")) # initialize XML tree with an OSM root element then append nodes/edges root = etree.Element("osm", attrib={"version": "1", "generator": "OSMnx"}) root = _append_nodes_xml_tree(root, gdf_nodes, node_attrs, node_tags) root = _append_edges_xml_tree(root, gdf_edges, edge_attrs, edge_tags, edge_tag_aggs, merge_edges) # write to disk etree.ElementTree(root).write(filepath) utils.log(f'Saved graph as .osm file at "{filepath}"')
if __name__ == "__main__":
    titanic_train_data = pd.read_csv(r"data/train.csv").drop(['Ticket'], axis=1)
    titanic_test_data = pd.read_csv(r"data/test.csv").drop(['Ticket'], axis=1)
    all_data = titanic_train_data.append(titanic_test_data).drop(['Survived'], axis=1)
    survived_data = titanic_train_data['Survived']

    freq_port = all_data.Embarked.dropna().mode()[0]
    all_data.loc[:, 'Embarked'] = all_data['Embarked'].fillna(freq_port)
    all_data = ap.correct_age(all_data)
    all_data.loc[:, 'Cabin'] = all_data['Cabin'].map(lambda x: 'U' if pd.isnull(x) else x[0])
    all_data.Cabin.replace(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'U'],
                           [0, 0, 0, 0, 0, 0, 0, 0, 1], inplace=True)
    all_data = tp.preprocessing(all_data)

    to_predict = all_data[survived_data.shape[0]:]
    train_x, test_x, train_y, test_y = train_test_split(
        all_data[0:survived_data.shape[0]], survived_data, test_size=0.2)

    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    DecisionTreeClassifierModel = DecisionTreeClassifier(max_features=10,
                                                         min_samples_leaf=6,
                                                         criterion='gini')
# Rename columns
ra_c.rename(columns={'country': 'country_c', 'times_2014_r': 'times_2014_r_c',
                     'cwur_2014_r': 'cwur_2014_r_c', 'sh_2014_r': 'sh_2014_r_c'},
            inplace=True)
ra_s.rename(columns={'country': 'country_s', 'times_2014_r': 'times_2014_r_s',
                     'cwur_2014_r': 'cwur_2014_r_s', 'sh_2014_r': 'sh_2014_r_s'},
            inplace=True)

# Merging the data based on top 100 universities from each ranking
rank_analysis_sct = pd.merge(ra_t,
                             pd.merge(ra_c, ra_s, on='university_name', how='outer'),
                             on='university_name', how='outer')

# Ensuring country column is not blank for universities not present in all 3 rankings
for i in range(len(rank_analysis_sct)):
    if pd.isnull(rank_analysis_sct.loc[i, 'country']):
        rank_analysis_sct.loc[i, 'country'] = str(
            rank_analysis[rank_analysis['university_name'] ==
                          rank_analysis_sct.loc[i, 'university_name']].iloc[0]['country'])

# Ensuring rank column is not blank for universities not present in all 3 rankings
rank_analysis_sct['times_2014_r'] = rank_analysis_sct['times_2014_r'].replace(np.nan, rank_analysis_sct['times_2014_r_c'])
rank_analysis_sct['times_2014_r'] = rank_analysis_sct['times_2014_r'].replace(np.nan, rank_analysis_sct['times_2014_r_s'])
rank_analysis_sct['cwur_2014_r'] = rank_analysis_sct['cwur_2014_r'].replace(np.nan, rank_analysis_sct['cwur_2014_r_c'])
rank_analysis_sct['cwur_2014_r'] = rank_analysis_sct['cwur_2014_r'].replace(np.nan, rank_analysis_sct['cwur_2014_r_s'])
rank_analysis_sct['sh_2014_r'] = rank_analysis_sct['sh_2014_r'].replace(np.nan, rank_analysis_sct['sh_2014_r_c'])
rank_analysis_sct['sh_2014_r'] = rank_analysis_sct['sh_2014_r'].replace(np.nan, rank_analysis_sct['sh_2014_r_s'])

# Replacing nan items (universities which do not exist in a ranking) with a rank of 700
# to ensure they are at a farther distance
def get_ingredient(df_tab, idx, sen_type):
    ingr = df_tab.loc[idx, 'ingr_' + sen_type]
    if pd.isnull(ingr):
        return None
    else:
        return ingr
def table_to_vcf(input_table_path, output_vcf_path=None):
    # validate args
    if not os.path.isfile(input_table_path):
        sys.exit("ERROR: %s not found" % input_table_path)

    # read input table. low_memory allows dtypes to be inferred
    t = pd.read_table(input_table_path, low_memory=False)

    missing_columns = {"chrom", "pos", "ref", "alt"} - set(t.columns)
    if missing_columns:
        sys.exit("ERROR: %s is missing columns: %s" % (input_table_path, str(missing_columns)))

    if output_vcf_path is None:
        output_vcf_path = input_table_path.replace(".tsv", "") + ".vcf.gz"

    print("Writing output to %s" % output_vcf_path)
    with gzopen(output_vcf_path, "w") as f:
        # each INFO key gets its own header ID so the declarations match the
        # fields written below
        f.write("""##source=ClinVar
##INFO=<ID=MUT,Number=1,Type=String,Description="MUT">
##INFO=<ID=MEASURESET_ID,Number=1,Type=String,Description="MEASURESET_ID">
##INFO=<ID=SYMBOL,Number=1,Type=String,Description="SYMBOL">
##INFO=<ID=CLINICAL_SIGNIFICANCE,Number=1,Type=String,Description="CLINICAL_SIGNIFICANCE">
##INFO=<ID=REVIEW_STATUS,Number=1,Type=String,Description="REVIEW_STATUS">
##INFO=<ID=ALL_SUBMITTERS,Number=1,Type=String,Description="ALL_SUBMITTERS">
##INFO=<ID=ALL_TRAITS,Number=1,Type=String,Description="ALL_TRAITS">
##INFO=<ID=ALL_PMIDS,Number=1,Type=String,Description="ALL_PMIDS">
##INFO=<ID=PATHOGENIC,Number=1,Type=String,Description="PATHOGENIC">
##INFO=<ID=CONFLICTED,Number=1,Type=String,Description="CONFLICTED">
""")
        f.write("\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]) + "\n")
        for i, table_row in t.iterrows():
            vcf_row = []
            vcf_row.append(table_row["chrom"])
            vcf_row.append(table_row["pos"])
            vcf_row.append('.')  # ID
            vcf_row.append(table_row["ref"])
            vcf_row.append(table_row["alt"])
            vcf_row.append('.')  # QUAL
            vcf_row.append('.')  # FILTER

            info_field = collections.OrderedDict()

            # from the VCF spec:
            #    INFO - additional information: (String, no white-space, semi-colons, or
            #    equals-signs permitted; commas are permitted only as delimiters for lists
            #    of values) INFO fields are encoded as a semicolon-separated series of short
            #    keys with optional values in the format: <key>=<data>[,data].
            for key in ["mut", "measureset_id", "symbol", "clinical_significance",
                        "review_status", "all_submitters", "all_traits", "all_pmids",
                        "pathogenic", "conflicted"]:
                if pd.isnull(table_row[key]):
                    continue
                value = str(table_row[key])
                value = re.sub(r'\s*[,;]\s*', '|', value)  # replace , or ; with |
                value = value.replace("=", " eq ").replace(" ", "_")
                info_field[key.upper()] = value
            vcf_row.append(";".join([key + "=" + value for key, value in info_field.items()]))

            f.write("\t".join(map(str, vcf_row)) + "\n")
            f.flush()

    print("Done")
# Read data
df_og = pd.read_csv("Titanic.csv")

## Preprocess data
# Set working dataframe to original dataframe
df = df_og

# Find out whether there are any duplicated rows
print("There are " + str(df.duplicated().sum()) + " duplicated rows")

# Remove duplicates
df = df.drop_duplicates()

# Find out whether there is any missing data (NaNs)
print("The total missing data is \n{} ".format(pd.isnull(df).sum()))

# Deal with missing data:
# Remove rows with missing data
#df = df.dropna(how='any', axis=0)
# or
# Impute values to replace NaN in the whole dataset, since there are no missing values in the target dataframe
for c in df.columns:
    if (pd.isnull(df[c]).sum() != 0):
        df[c].fillna(df[c].mean(), inplace=True)  # Imputing is more accurate in our case

# Drop 'useless' columns: name of passenger (meaningless), index (meaningless), and sex (represented twice)
df = df.drop(['Name', 'Unnamed: 0', 'Sex'], axis=1)

# Get dummies of features (categorical to numerical)
def _set_values_series(dfs):
    return set(dfs[~pd.isnull(dfs)])
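# Quick example (assumes pandas/numpy are imported as pd/np, as elsewhere here):
# NaNs are masked out before the set is built, and duplicate values collapse.
print(_set_values_series(pd.Series([1.0, np.nan, 2.0, 2.0])))  # {1.0, 2.0}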
def tolist(*args):
    q = [[j for j in i if not (pd.isnull(j))] for i in args]
    return q
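# Example: each positional argument is cleaned of NaNs independently.
print(tolist([1, np.nan, 2], [np.nan]))  # [[1, 2], []]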
def look_at_daily_bets(game_df, odds_df, date, options): """ """ # Convert the date to an epoch dt this_epch = Tutils.tme2epoch(date, "%Y%m%d") # sort by time downloaded and remove duplicates, keeping the most recent one odds_df = odds_df.sort_values("Time") recent_odds_df = odds_df.drop_duplicates( subset=(["Away_Team", "Home_Team", "Game_Time"]), keep='last') # Maps team names to dataframe of that team team_df_map = NBA_utils.make_team_df_map(game_df) all_teams = sorted(list(team_df_map.keys())) stat_map = {} # Add whatever columns we want to look at #d = 3 #r = 8 d = 3 for r in range(1, 20): pct_counts = add_col_and_print_threshold_counts( game_df, d, r, team_df_map) key = "%d %d" % (d, r) stat_map[key] = pct_counts # if options.plot_file: plot_over_under_pcts(stat_map, options.plot_file) plot_col_running_sum(game_df, "OU_HIT_3_avg_8", options.plot_file) # # Add the running 3game average to look at # num_games = 3 home_col = "HomeTeamAvg%sGames" % num_games game_df[home_col] = game_df.apply( lambda row: NBA_utils.calculate_team_avg_xdays_game_df( row['HomeTeam'], num_games, row, team_df_map[row['HomeTeam']]), axis=1) away_col = "AwayTeamAvg%sGames" % num_games game_df[away_col] = game_df.apply( lambda row: NBA_utils.calculate_team_avg_xdays_game_df( row['AwayTeam'], num_games, row, team_df_map[row['AwayTeam']]), axis=1) # # Load up model information # model = pickle.load(open(model_file, "rb")) # Load in the info file info_file = model_file.replace(".sav", ".info") info_F = open(info_file, "r") info_lines = info_F.readlines() # Get the predictors predictors = MODEL_utils.read_predictors_from_info(info_lines) training_end_date = MODEL_utils.read_end_date_from_info(info_lines) game_df = add_model_predictors(predictors, game_df, this_epch, team_df_map) game_df.to_csv("game_df.csv") # # Get the vegas lines for this date and iterate over them # day_df = recent_odds_df[recent_odds_df['GameDateEpoch'] == this_epch] for ind, row in day_df.iterrows(): print("Info on game %s at %s. 
Over_Under: %f" % (row['Away_Team'], row['Home_Team'], row['Over_under_VI Consensus'])) ########################################################### # # Unnecessary info print statements # # Look at over/unders for this team in either home or away games print_team_over_unders(game_df, row, 'Away_Team') print_team_over_unders(game_df, row, 'Home_Team') # Look at last 3 away games for away team print_last_x_games(game_df, this_epch, 10, 'RemappedAwayTeam', 'Away_Team', row, 'away') print_last_x_games(game_df, this_epch, 10, 'RemappedHomeTeam', 'Home_Team', row, 'home') ########################################################### # Get game_df maps for this team away_team_df = game_df[ (game_df['RemappedAwayTeam'] == row['Away_Team']) | (game_df['RemappedHomeTeam'] == row['Away_Team'])] if (len(away_team_df) == 0): print("Can't make map for team: '%s'" % row['Away_Team']) continue home_team_df = game_df[ (game_df['RemappedAwayTeam'] == row['Home_Team']) | (game_df['RemappedHomeTeam'] == row['Home_Team'])] if (len(home_team_df) == 0): print("Can't make map for team: %s" % row['Home_Team']) continue # # THIS IS THE KEY RIGHT NOW # if away_team_df.iloc[-1]['RemappedAwayTeam'] == row['Away_Team']: away_team_avg = away_team_df.iloc[-1]['AwayTeamAvg3Games'] else: away_team_avg = away_team_df.iloc[-1]['HomeTeamAvg3Games'] print("\t%s %s : %f" % (row['Away_Team'], 'TeamAvg3Games', away_team_avg)) if home_team_df.iloc[-1]['RemappedAwayTeam'] == row['Home_Team']: home_team_avg = home_team_df.iloc[-1]['AwayTeamAvg3Games'] else: home_team_avg = home_team_df.iloc[-1]['HomeTeamAvg3Games'] print("\t%s %s : %f" % (row['Home_Team'], 'TeamAvg3Games', home_team_avg)) avg_points = home_team_avg + away_team_avg print("\t%f" % (avg_points)) # # If the teams have been averaging more than 7 over the O/U, it's usually over # if avg_points - abs(row['Over_under_Open']) >= 9: print(" --YOOO BET ON THE OVER HERE: BEEN AVERAGING %d!!!" 
% (avg_points)) ########################################### # Add modeled_points ########################################### # Get home team modeled points ht_preds_list = [] at_preds_list = [] for p in predictors: ht_pred_col = "%s_%s" % ('HomeTeam', p) at_pred_col = "%s_%s" % ('AwayTeam', p) if p.startswith("Opp"): # # Need differen't logic for getting opponents averages # # Get home Predictor if away_team_df.iloc[-1]['RemappedHomeTeam'] == row[ 'Away_Team']: if pd.isnull(away_team_df.iloc[-1][at_pred_col]): ht_preds_list.append(np.nan) else: ht_preds_list.append( away_team_df.iloc[-1][at_pred_col]) else: if pd.isnull(away_team_df.iloc[-1][ht_pred_col]): ht_preds_list.append(np.nan) else: ht_preds_list.append( away_team_df.iloc[-1][ht_pred_col]) # Get Away predictors if home_team_df.iloc[-1]['RemappedHomeTeam'] == row[ 'Home_Team']: if pd.isnull(home_team_df.iloc[-1][at_pred_col]): at_preds_list.append(np.nan) else: at_preds_list.append( home_team_df.iloc[-1][at_pred_col]) else: if pd.isnull(home_team_df.iloc[-1][ht_pred_col]): at_preds_list.append(np.nan) else: at_preds_list.append( home_team_df.iloc[-1][ht_pred_col]) else: # Get home Predictor if home_team_df.iloc[-1]['RemappedHomeTeam'] == row[ 'Home_Team']: if pd.isnull(home_team_df.iloc[-1][ht_pred_col]): ht_preds_list.append(np.nan) else: ht_preds_list.append( home_team_df.iloc[-1][ht_pred_col]) else: if pd.isnull(home_team_df.iloc[-1][at_pred_col]): ht_preds_list.append(np.nan) else: ht_preds_list.append( home_team_df.iloc[-1][at_pred_col]) # Get Away predictors if away_team_df.iloc[-1]['RemappedHomeTeam'] == row[ 'Away_Team']: if pd.isnull(away_team_df.iloc[-1][ht_pred_col]): at_preds_list.append(np.nan) else: at_preds_list.append( away_team_df.iloc[-1][ht_pred_col]) else: if pd.isnull(away_team_df.iloc[-1][at_pred_col]): at_preds_list.append(np.nan) else: at_preds_list.append( away_team_df.iloc[-1][at_pred_col]) ht_preds = np.array(ht_preds_list) at_preds = np.array(at_preds_list) if np.nan not in ht_preds_list: ht_fcst = model.predict(ht_preds.reshape(1, -1))[0] else: ht_fcst = -9999 if np.nan not in at_preds_list: at_fcst = model.predict(at_preds.reshape(1, -1))[0] else: at_fcst = -9999 print(ht_preds) print("\tModeled HomeTeam points: %s" % ht_fcst) print(at_preds) print("\tModeled AwayTeam points: %s" % at_fcst) modeled_OU = ht_fcst + at_fcst # If the modeled OU is more than 3 under the vegas line, bet this bitch print(row['Over_under_VI Consensus']) if (((abs(row['Over_under_VI Consensus']) - modeled_OU) > 2) & ((abs(row['Over_under_VI Consensus']) - modeled_OU) < 5)): print(" --YOOO BET ON THE UNDER HERE: Model expecting: %d!!!" % (modeled_OU)) if ((modeled_OU - 9) > abs(row['Over_under_VI Consensus'])): print(" --YOOO BET ON THE OVER HERE: Model expecting: %d!!!" % (modeled_OU)) # If modeled is more than 2 under AND average is more than 2 under. Bet. #if (((abs(row['Over_under_VI Consensus'])-modeled_OU) > 2) and # ((abs(row['Over_under_VI Consensus'])-avg_points) > 2)): # print (" --YOOO BET ON THE UNDER HERE: Model expecting %d AND avg saying %d!!!" % (modeled_OU, avg_points)) print("-------------------------------------------------------")
def __init__(self):
    print('loading model ...')
    self.nlp = spacy.load('de_core_news_lg', exclude=['ner'])
    self.nlp.add_pipe('ner', source=spacy.load('de_core_news_lg'))
    ruler = self.nlp.add_pipe('entity_ruler', before='ner')
    ruler.from_disk("data/custom.jsonl")
    ruler.to_disk("data/.swap/entity_ruler")

    additional_df = pd.read_excel(inputSheet, 'additional', usecols=['entity', 'variations', 'type'])
    self.additional_dict = dict()
    for _, r in additional_df.iterrows():
        k = r['entity']
        t = r['type']
        self.additional_dict[k.lower()] = (k, t)
        if not pd.isnull(r['variations']):
            for v in re.split(r'\s*;\s*', r['variations']):
                self.additional_dict[v.lower()] = (k, t)

    ignore_df = pd.read_excel(inputSheet, 'ignore', usecols=['entity'])
    self.ignore_set = set([r['entity'].lower() for _, r in ignore_df.iterrows()])

    print('parsing transcripts ...')
    self.timecode_pattern = re.compile(r'[\(\[].{2}:.{2}[\)\]]|[\(\[].{1,2}:.{2}:.{2}[\)\]]')
    self.entities_dict = dict()
    self.segments = []
    self.entities = []
    transcript_id: int = 0
    transcript_path: str = None
    for transcript_id in range(1, 5):
        print(f'... {transcript_id}')
        transcript_path = f'data/transcript-{transcript_id}.txt'
        with open(transcript_path, 'r') as file:
            transcript = file.read()
        starts, segments = self.split_transcript(transcript)
        for i, segment in enumerate(segments):
            entities = self.get_entities(segment)
            for e in entities:
                if e[0] not in self.entities_dict:
                    self.entities_dict[e[0]] = set()
                if e[0].lower() in self.additional_dict:
                    # correct category?
                    self.entities_dict[e[0]] = set([self.additional_dict[e[0].lower()][1]])
                else:
                    self.entities_dict[e[0]].add(e[1])
            entities = [e[0] for e in entities]
            self.segments.append(Segment(transcript_id, starts[i], segment, entities))

    with open(entitiesFile, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['entity', 'type'])
        for e in sorted(self.entities_dict.keys()):
            print(e)
            writer.writerow([e, ';'.join(sorted(list(self.entities_dict[e])))])

    with open(segmentsFile, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['video', 'start', 'text', 'entities'])
        for s in self.segments:
            writer.writerow([s.video, s.start, s.text, ';'.join(sorted(s.entities))])

    dfe = pd.read_csv(entitiesFile)
    dfe.sort_values('type', inplace=True)
    dfe.to_excel(extractionSheet, sheet_name='entities', index=False)

    dfm = pd.DataFrame()
    dfm = dfm.append(pd.read_excel(inputSheet, 'entities',
                                   usecols=['entity', 'variations', 'wikidata', 'sapa', 'type', 'image']),
                     ignore_index=True)
    dfm = dfm.append(pd.read_excel(extractionSheet, 'entities', usecols=['entity', 'type']),
                     ignore_index=True)
    dfm.head()
    length2 = len(dfm)
    dfm.sort_values('entity', inplace=True)
    dfm.drop_duplicates(keep='first', inplace=True, subset=['entity'])
    length3 = len(dfm)
    print(f'Total after merge: {length2} | Removed duplications: {length2 - length3}')
    dfm.to_excel(outputSheet, sheet_name='entities', index=False)

    dfi = pd.DataFrame().append(pd.read_excel(inputSheet, 'ignore', usecols=['entity']))
    dfa = pd.DataFrame().append(pd.read_excel(inputSheet, 'additional',
                                              usecols=['entity', 'variations', 'type']))
    with pd.ExcelWriter(outputSheet, engine='openpyxl', mode='a') as writer:
        dfi.to_excel(writer, sheet_name='ignore', index=False)
        dfa.to_excel(writer, sheet_name='additional', index=False)
def upload(df, gfile="/New Spreadsheet", wks_name=None, col_names=True, row_names=True, clean=True, credentials=None, start_cell='A1', df_size=False, new_sheet_dimensions=(1000, 100), input_option='USER_ENTERED'): ''' Upload given Pandas DataFrame to Google Drive and returns gspread Worksheet object :param df: Pandas DataFrame :param gfile: path to Google Spreadsheet or gspread ID :param wks_name: worksheet name :param col_names: passing top row to column names for Pandas DataFrame :param row_names: passing left column to row names for Pandas DataFrame :param clean: clean all data in worksheet before uploading :param credentials: provide own credentials :param start_cell: specify where to insert the DataFrame; default is A1 :param df_size: -If True and worksheet name does NOT exist, will create a new worksheet that is the size of the df; otherwise, by default, creates sheet of 1000x100 cells. -If True and worksheet does exist, will resize larger or smaller to fit new dataframe. -If False and dataframe is larger than existing sheet, will resize the sheet larger. -If False and dataframe is smaller than existing sheet, does not resize. :param new_sheet_dimensions: tuple of (row, cols) for size of a new sheet :param input_option: Determines how input data should be interpreted. (see ValueInputOption GoogleSheet API) :param conv_string: If True, converts dataframe to str before pushing to Google Sheet :type df: class 'pandas.core.frame.DataFrame' :type gfile: str :type wks_name: str :type col_names: bool :type row_names: bool :type clean: bool :type credentials: class 'oauth2client.client.OAuth2Credentials' :type start_cell: str :type df_size: bool :type new_sheet_dimensions: tuple :type conv_string: bool :returns: gspread Worksheet :rtype: class 'gspread.models.Worksheet' :Example: >>> from df2gspread import df2gspread as d2g >>> import pandas as pd >>> df = pd.DataFrame([1 2 3]) >>> wks = d2g.upload(df, wks_name='Example worksheet') >>> wks.title 'Example worksheet' ''' # access credentials credentials = get_credentials(credentials) # auth for gspread gc = gspread.authorize(credentials) try: gc.open_by_key(gfile).__repr__() gfile_id = gfile except: gfile_id = get_file_id(credentials, gfile, write_access=True) # Tuple of rows, cols in the dataframe. # If user did not explicitly specify to resize sheet to dataframe size # then for new sheets set it to new_sheet_dimensions, which is by default 1000x100 if df_size: new_sheet_dimensions = (len(df), len(df.columns)) wks = get_worksheet(gc, gfile_id, wks_name, write_access=True, new_sheet_dimensions=new_sheet_dimensions) if clean: wks = clean_worksheet(wks, gfile_id, wks_name, credentials) start_col = re.split(r'(\d+)', start_cell)[0].upper() start_row = re.split(r'(\d+)', start_cell)[1] start_row_int, start_col_int = gspread.utils.a1_to_rowcol(start_cell) # find last index and column name (A B ... Z AA AB ... AZ BA) num_rows = len(df.index) + 1 if col_names else len(df.index) last_idx_adjust = start_row_int - 1 last_idx = num_rows + last_idx_adjust num_cols = len(df.columns) + 1 if row_names else len(df.columns) last_col_adjust = start_col_int - 1 last_col_int = num_cols + last_col_adjust last_col = re.split( r'(\d+)', (gspread.utils.rowcol_to_a1(1, last_col_int)))[0].upper() # If user requested to resize sheet to fit dataframe, go ahead and # resize larger or smaller to better match new size of pandas dataframe. # Otherwise, leave it the same size unless the sheet needs to be expanded # to accomodate a larger dataframe. 
if df_size: wks.resize(rows=len(df.index) + col_names, cols=len(df.columns) + row_names) if len(df.index) + col_names + last_idx_adjust > wks.row_count: wks.add_rows( len(df.index) - wks.row_count + col_names + last_idx_adjust) if len(df.columns) + row_names + last_col_adjust > wks.col_count: wks.add_cols( len(df.columns) - wks.col_count + row_names + last_col_adjust) # Define first cell for rows and columns first_col = re.split(r'(\d+)', (gspread.utils.rowcol_to_a1( 1, start_col_int + 1)))[0].upper() if row_names else start_col first_row = str(start_row_int + 1) if col_names else start_row # Addition of col names if col_names: cell_list = wks.range('%s%s:%s%s' % (first_col, start_row, last_col, start_row)) for idx, cell in enumerate(cell_list): cell.value = df.columns.astype(str)[idx] wks.update_cells(cell_list) # Addition of row names if row_names: cell_list = wks.range('%s%s:%s%d' % (start_col, first_row, start_col, last_idx)) for idx, cell in enumerate(cell_list): cell.value = df.index.astype(str)[idx] wks.update_cells(cell_list) # convert df values to string # df = df.applymap(str) # Addition of cell values cell_list = wks.range('%s%s:%s%d' % (first_col, first_row, last_col, last_idx)) for j, idx in enumerate(df.index): for i, col in enumerate(df.columns.values): if not pd.isnull(df[col][idx]): cell_list[i + j * len(df.columns.values)].value = df[col][idx] wks.update_cells(cell_list, value_input_option=input_option) return wks
def test_fred_nan(self):
    start = datetime(2010, 1, 1)
    end = datetime(2013, 1, 27)
    df = web.DataReader("DFII5", "fred", start, end)
    assert pd.isnull(df.loc["2010-01-01"][0])
def strategy(sdk): ################### # 股票策略 # ################### tradeDateFlag = sdk.getGlobal('TRADEDATEFLAG') tradeDateFlag += 1 sdk.setGlobal('TRADEDATEFLAG', tradeDateFlag) if tradeDateFlag % HOLDINGPERIOD == 0: stockList = sdk.getStockList() stop = sdk.getFactorData("LZ_CN_STKA_QUOTE_TCLOSE")[-1] # 获取最近的收盘价因子矩阵 profit = np.array( sdk.getFactorData("LZ_CN_STKA_FIN_IND_EBITPS")[-21]) ###息税前利润 data_in = Factors(sdk, -21) stop_1 = data_in[-4 - 29] stop = data_in[-3 - 29] industry_new = sdk.getFactorData("LZ_CN_STKA_INDU_ZX")[-1] # dtl = SMB # lyst = [i for i in range(0,len(SMB)) if SMB[i]>np.median(SMB)+5*np.std(SMB)] # dtl[lyst] = np.median(SMB)+5*np.std(SMB) # lyst = [i for i in range(0,len(SMB)) if SMB[i]<np.median(SMB)-5*np.std(SMB)] # dtl[lyst] = np.median(SMB)-5*np.std(SMB) # lyst = [i for i in range(0,len(SMB)) if pd.isnull(SMB[i])==True] # dtl[lyst] = mean_data[industry_new[lyst]] # SMB = dtl data_mat = Factors(sdk, -21) #data_mat.append(np.array(sdk.getFactorData("LZ_CN_STKA_VAL_A_TCAP")[-21])) #data_mat.append(industry_new) for i in range(0, len(data_mat) - 30): sum_data = [float(0) for ii in range(0, 30)] num_data = [float(0) for ii in range(0, 30)] whole_sum = 0 whole_num = 0 for j in range(0, len(data_mat[i])): if not pd.isnull( data_mat[i][j] ) and data_mat[i][j] < 1.0e+20 and data_mat[i][j] > 0.001: sum_data[int(industry_new[j])] += data_mat[i][j] num_data[int(industry_new[j])] += 1 whole_sum += data_mat[i][j] whole_num += 1 if whole_num == 0: whole_num += 1 mean_data = np.array(sum_data) / np.array(num_data) for j in range(0, len(mean_data)): if pd.isnull(mean_data[j]): mean_data[j] = whole_sum / whole_num for j in range(0, len(data_mat[i])): if pd.isnull( data_mat[i] [j]) or data_mat[i][j] > 1.0e+20 or data_mat[i][j] < 0.001: data_mat[i][j] = mean_data[int(industry_new[j])] median = np.median(data_mat[i]) sd = np.std(data_mat[i]) for j in range(0, len(data_mat[i])): if data_mat[i][j] > median + 5 * sd: data_mat[i][j] = median + 5 * sd if data_mat[i][j] < median - 5 * sd: data_mat[i][j] = median - 5 * sd XX = [] X = data_mat[-2 - 29] for j in range(0, len(X)): X[j] = math.log(X[j]) # print X[j] XX.append(X) #XX.append(data_mat[-1]) for i in range(0, 30): XX.append(data_mat[-i - 1]) X = XX data_mat = data_mat[:10] X = np.transpose(X) for i in range(0, len(data_mat)): lm = linear_model.LinearRegression() lm.fit(X, data_mat[i]) data_mat[i] = data_mat[i] - lm.predict(X) data_mat[i] = data_mat[i] / len(data_mat[i]) ##归一化处理 mat_max = max(data_mat[i]) mat_min = min(data_mat[i]) for j in range(0, len(data_mat[i])): data_mat[i][j] = (data_mat[i][j] - mat_min) / (mat_max - mat_min) #print data_mat[i] data_mat = np.transpose(data_mat) new_data_mat = [] label = [] profit = np.array(stop) / np.array(stop_1) - 1 profit1 = [i for i in profit if not pd.isnull(i)] for i in range(0, len(stop_1)): if profit[i] > np.percentile(profit1, 70): label.append(1) new_data_mat.append(data_mat[i]) continue if profit[i] <= np.percentile(profit1, 30): label.append(0) new_data_mat.append(data_mat[i]) continue data_mat = new_data_mat sequ = range(0, len(data_mat)) random.shuffle(sequ) data_mat_t = [] label_t = [] for i in sequ: data_mat_t.append(data_mat[i]) label_t.append(label[i]) data_mat = data_mat_t label = label_t data_new = Factors(sdk, -1) #data_new.append(np.array(sdk.getFactorData("LZ_CN_STKA_VAL_A_TCAP")[-1])) #data_new.append(industry_new) industry_new = sdk.getFactorData("LZ_CN_STKA_INDU_ZX")[-1] for i in range(0, len(data_new) - 30): sum_data = [0 for ii in range(0, 30)] num_data = [0 for 
ii in range(0, 30)] whole_sum = 0 whole_num = 0 for j in range(0, len(data_new[i])): if not pd.isnull( data_new[i][j] ) and data_new[i][j] < 1.0e+20 and data_new[i][j] > 0.001: sum_data[int(industry_new[j])] += data_new[i][j] num_data[int(industry_new[j])] += 1 whole_sum += data_new[i][j] whole_num += 1 if whole_num == 0: whole_num += 1 mean_data = np.array(sum_data) / np.array(num_data) for j in range(0, len(mean_data)): if pd.isnull(mean_data[j]): mean_data[j] = whole_sum / whole_num for j in range(0, len(data_new[i])): if pd.isnull( data_new[i] [j]) or data_new[i][j] > 1.0e+20 or data_new[i][j] < 0.001: data_new[i][j] = mean_data[int(industry_new[j])] median = np.median(data_new[i]) sd = np.std(data_new[i]) for j in range(0, len(data_new[i])): if data_new[i][j] > median + 5 * sd: data_new[i][j] = median + 5 * sd if data_new[i][j] < median - 5 * sd: data_new[i][j] = median - 5 * sd XX = [] X = data_new[-2 - 29] for j in range(0, len(X)): X[j] = math.log(X[j]) # print X[j] XX.append(X) for i in range(0, 30): XX.append(data_new[-i - 1]) X = XX data_new = data_new[:10] X = np.transpose(X) for i in range(0, len(data_new)): lm = linear_model.LinearRegression() lm.fit(X, data_new[i]) data_new[i] = data_new[i] - lm.predict(X) data_new[i] = data_new[i] / len(data_new[i]) ##归一化处理 new_max = max(data_new[i]) new_min = min(data_new[i]) for j in range(0, len(data_new[i])): data_new[i][j] = (data_new[i][j] - new_min) / (new_max - new_min) #print data_new[i] data_new = np.transpose(data_new) finger = 10 ll = len(data_mat) ll = ll / finger max_score = 0 for i in range(0, 10): test_set = np.array(data_mat[ll * i:ll * (i + 1)]) test_lab = np.array(label[ll * i:ll * (i + 1)]) train_set = np.array(data_mat[:ll * i] + data_mat[ll * (i + 1):]) train_lab = np.array(label[:ll * i] + label[ll * (i + 1):]) #train_lab = np.array(train_lab) head = len(train_set[0]) #print "head=%i" %head #print "sample number=%i" %len(train_set) tail = 1 array1 = [head, head * 3, head * 10, head * 3, tail] ###网络 print "array1=%s" % array1 trainer, net = NetworkTrain(array1, train_set, train_lab) score = NetworkTest(trainer, net, train_set, train_lab) if score > max_score: max_score = score machine = trainer print "Optimized OOB Score: %f" % max_score Y_random = np.random.binomial(data_new.shape[0], 0.5, size=20000) predicted = NetworkPredict(trainer, data_new, Y_random) # Create Random Forest object #model= RandomForestClassifier(n_estimators=10) # Train the model using the training sets and check score #model.fit(data_mat, label) #Predict Output #predicted= model.predict(data_new) WholeDict = dict(zip(stockList, predicted)) stockToBuy = [] buy_sq = [] stockToSell = [] for key in WholeDict.keys(): if WholeDict[key] == 1: stockToBuy.append(key) buy_sq.append(WholeDict[key]) if WholeDict[key] != 1: stockToSell.append(key) buyDict = dict(zip(stockToBuy, buy_sq)) buyDict_Sorted = sorted(buyDict.items(), key=lambda asd: asd[1], reverse=True) stockToBuy = [] for i in range(0, len(buyDict_Sorted)): stockToBuy.append(buyDict_Sorted[i][0]) #Date = sdk.getNowDate() #sell_plan[Date] = stockToBuy #ii=0 #selldate='' #for key in sell_plan.keys(): # d1 = datetime.datetime.strptime(key, '%Y%m%d') # d2 = datetime.datetime.strptime(Date, '%Y%m%d') # if d2-d1==10: # ii=1 # buydate=key #if ii==1 : # for i in range(0,len(sell_plan[buydate])): # stockToSell.append(sell_plan[buydate][i]) stockToBuy = stockToBuy[:HOLDINGNUMBER] # 更新持仓,卖出股票池锁定 stockToSell = getPositionList(sdk) #stockToSell1 = getPositionList(sdk) #stockToSell = [val for val in 
stockToSell1 if val in stockToSell] # 卖出股票 quotes = sdk.getQuotes(stockToSell) stockToSell = list(set(stockToSell) & set(quotes.keys())) # 列出要卖出的股票代码和相应的可卖持仓 # print tradeDateFlag # print stockToBuy # print stockToSell # print "\n" if stockToSell != []: pass bar = {} for s in stockToSell: bar[s] = quotes[s].open position = getPositionDict(sdk) if stockToSell != []: sellStockList(sdk, stockToSell, bar) # 以开盘价卖出股票 # 更新持仓 stockPositionList = getPositionList(sdk) # 买入股票池锁定 quotes = sdk.getQuotes(stockToBuy) # 获取股票列表的盘口信息 stockToBuy = list(set(stockToBuy) & set(quotes.keys())) # 列出要买入的股票代码和相应的可卖持仓 bar = {} for s in stockToBuy: bar[s] = quotes[s].open position = getPositionDict(sdk) buyStockList(sdk, stockToBuy, bar) # 以开盘价买入股票
def custom_heuristic(file_path): ''' You are given a list of Titantic passengers and their associated information. More information about the data can be seen at the link below: http://www.kaggle.com/c/titanic-gettingStarted/data For this exercise, you need to write a custom heuristic that will take in some combination of the passenger's attributes and predict if the passenger survived the Titanic diaster. Can your custom heuristic beat 80% accuracy? The available attributes are: Pclass Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd) Name Name Sex Sex Age Age SibSp Number of Siblings/Spouses Aboard Parch Number of Parents/Children Aboard Ticket Ticket Number Fare Passenger Fare Cabin Cabin Embarked Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton) SPECIAL NOTES: Pclass is a proxy for socioeconomic status (SES) 1st ~ Upper; 2nd ~ Middle; 3rd ~ Lower Age is in years; fractional if age less than one If the age is estimated, it is in the form xx.5 With respect to the family relation variables (i.e. SibSp and Parch) some relations were ignored. The following are the definitions used for SibSp and Parch. Sibling: brother, sister, stepbrother, or stepsister of passenger aboard Titanic Spouse: husband or wife of passenger aboard Titanic (mistresses and fiancees ignored) Parent: mother or father of passenger aboard Titanic Child: son, daughter, stepson, or stepdaughter of passenger aboard Titanic Write your prediction back into the "predictions" dictionary. The key of the dictionary should be the passenger's id (which can be accessed via passenger["PassengerId"]) and the associating value should be 1 if the passenger survvied or 0 otherwise. For example, if a passenger is predicted to have survived: passenger_id = passenger['PassengerId'] predictions[passenger_id] = 1 And if a passenger is predicted to have perished in the disaster: passenger_id = passenger['PassengerId'] predictions[passenger_id] = 0 You can also look at the Titantic data that you will be working with at the link below: https://www.dropbox.com/s/r5f9aos8p9ri9sa/titanic_data.csv ''' cols = [ 'PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked' ] cols.remove('PassengerId') cols.remove('Survived') cols.remove('Name') # Ticket? 
cols.remove('Ticket') # Bucketize age df = pandas.read_csv(file_path) df['Age'].fillna(-1, inplace=True) df['AgeBucket'] = pandas.Series('', index=df.index) for idx, row in df.iterrows(): age = row['Age'] age_bucket = 'c' if age < 18: age_bucket = 'young' elif age < 25: age_bucket = 'midyoung' elif age < 40: age_bucket = 'midmid' else: age_bucket = 'old' df.loc[idx, 'AgeBucket'] = age_bucket # After AgeBucket is added, replace the Age with Bucket cols.remove('Age') cols.append('AgeBucket') # Replace cabin with first char df['Deck'] = pandas.Series('', index=df.index) for idx, row in df.iterrows(): cabin = row['Cabin'] if not pandas.isnull(cabin): df.loc[idx, 'Deck'] = cabin[0] cols.remove('Cabin') cols.append('Deck') # Bucketize fare df['Fare'].fillna(-1, inplace=True) df['FareBucket'] = pandas.Series(0, index=df.index) for idx, row in df.iterrows(): fare = row['Fare'] # Cap to makes bucketizing look nicer if fare > 100: df.loc[idx, 'Fare'] = 100 fare_bucket = '' if fare <= 10: fare_bucket = 10 elif fare <= 20: fare_bucket = 20 elif fare <= 30: fare_bucket = 30 elif fare <= 40: fare_bucket = 40 else: fare_bucket = 100 df.loc[idx, 'FareBucket'] = fare_bucket cols.remove('Fare') cols.append('FareBucket') # Print data relations features = [] for i, coli in enumerate(cols): for j, colj in enumerate(cols): if i <= j: features.append([coli, colj]) # features.append([coli, colj, 'Sex']) features = [] #x = df[(df['Sex'] == 'male') & (df['Deck'] == 'E') & (df['AgeBucket'] == 'midmid')] survivor_threshold = 0.8 base_threshold = 0 for f in features: print "--------------------------------------------------------------" print f predictions = {} for passenger_index, passenger in df.iterrows(): key = '' for k in f: v = passenger[k] key = key + ' ' + str(v) predictions.setdefault(key, [0, 0]) predictions[key][0] += passenger['Survived'] predictions[key][1] += 1 # Print the stats for features list # print ', '.join(df.columns) # print predictions for k in sorted(predictions.keys()): v = predictions[k] survivor = 1.0 * v[0] / v[1] base = v[1] if survivor < survivor_threshold or base < base_threshold or 'female' in k: continue print '%s => %.2f (%d)' % (k, (1.0 * v[0] / v[1]), v[1]) # Observations: # PREDICTION female in [1, 2] class => 97, 92% # PREDICTION female with SbSp <= 2 [0.79, 0.75, 0.77] # PREDICTION female not from S [69%, against C, Q] # PREDICTION decks: B, D, E # TODO: fare: bucketize total_survivors = 0 predictions = {} # df = pandas.read_csv(file_path) for passenger_index, passenger in df.iterrows(): passenger_id = passenger['PassengerId'] survivor = 0 sex = passenger['Sex'] if sex == 'female': # 1 or 2nd class? if passenger['Pclass'] in [1, 2]: survivor = 1 # Embarked in 'C' (Cherbourg)? 
if passenger['Embarked'] == 'C': survivor = 1 if passenger['Deck'] in ['B', 'C', 'D', 'E']: survivor = 1 if passenger['FareBucket'] == 100: survivor = 1 # Bunch of findings if passenger['Pclass'] == 2 and passenger['Parch'] == 2: survivor = 1 if passenger['SibSp'] == 1 and passenger['Deck'] in ['B', 'D']: survivor = 1 if passenger['Parch'] == 2 and passenger['AgeBucket'] == 'midyoung': survivor = 1 if passenger['Embarked'] == 'C' and passenger['Deck'] == 'D': survivor = 1 if passenger['AgeBucket'] == 'midmid' and passenger['Deck'] in [ 'B', 'D' ]: survivor = 1 if passenger['Sex'] == 'male' and passenger['Deck'] in [ 'E' ] and passenger['AgeBucket'] == 'midmid': survivor = 1 predictions[passenger_id] = survivor if survivor: total_survivors = total_survivors + 1 print 'prediction rate: ', total_survivors, len(predictions) accurate = 0 for _, passenger in df.iterrows(): passenger_id = passenger['PassengerId'] prediction = predictions[passenger_id] if prediction == passenger['Survived']: accurate = accurate + 1 #survived = sum(df['Survived'] == 1) print 'accuracy: ', (1.0 * accurate / len(predictions)) return predictions #custom_heuristic('./kaggle_titanic_train.csv') # SUBMITTED CODE: # passenger_id = passenger['PassengerId'] # # # Set custom columns # passenger['Deck'] = '' # if not pandas.isnull(passenger['Cabin']): # passenger['Deck'] = passenger['Cabin'][0] # # passenger['AgeBucket'] = '' # age = passenger['Age'] # age_bucket = 'c' # if age < 18: # age_bucket = 'young' # elif age < 25: # age_bucket = 'midyoung' # elif age < 40: # age_bucket = 'midmid' # else: # age_bucket = 'old' # passenger['AgeBucket'] = age_bucket # # # survivor = 0 # # sex = passenger['Sex'] # if sex == 'female': # # 1 or 2nd class? # if passenger['Pclass'] in [1, 2]: # survivor = 1 # # Embarked in 'C' (Cherbourg)? # if passenger['Embarked'] == 'C': # survivor = 1 # if passenger['Deck'] in ['B', 'C', 'D', 'E']: # survivor = 1 # if passenger['Fare'] >= 40: # survivor = 1 # # # Bunch of findings # if passenger['Pclass'] == 2 and passenger['Parch'] == 2: # survivor = 1 # if passenger['SibSp'] == 1 and passenger['Deck'] in ['B', 'D']: # survivor = 1 # if passenger['Parch'] == 2 and passenger['AgeBucket'] == 'midyoung': # survivor = 1 # if passenger['Embarked'] == 'C' and passenger['Deck'] == 'D': # survivor = 1 # if passenger['AgeBucket'] == 'midmid' and passenger['Deck'] in ['B', 'D']: # survivor = 1 # if passenger['Sex'] == 'male' and passenger['Deck'] in ['E'] and passenger['AgeBucket'] == 'midmid': # survivor = 1 # # predictions[passenger_id] = survivor
def _initialize_custom_data(self):
    windfarm = self.config["array_system_design"]["location_data"]

    self.location_data = extract_library_specs("cables", windfarm, file_type="csv")

    # Make sure no data is missing
    missing = set(self.COLUMNS).difference(self.location_data.columns)
    if missing:
        raise ValueError(
            f"The following columns must be included in the location data: {missing}"
        )

    self._format_windfarm_data()

    # Ensure there is no missing data in required columns
    missing_data_cols = [
        c for c in self.REQUIRED if pd.isnull(self.location_data[c]).sum() > 0
    ]
    if missing_data_cols:
        raise ValueError(f"Missing data in columns: {missing_data_cols}!")

    # Ensure there is no missing data in optional columns
    missing_data_cols = [
        c for c in self.OPTIONAL
        if (pd.isnull(self.location_data[c]) | (self.location_data[c] == 0)).sum() > 0
    ]
    if missing_data_cols:
        message = (
            f"Missing data in columns {missing_data_cols}; "
            "all values will be calculated."
        )
        warnings.warn(message)

    # Ensure the number of turbines matches what's expected
    if self.location_data.shape[0] != self.system.num_turbines:
        raise ValueError(
            f"The provided number of turbines ({self.location_data.shape[0]}) "
            f"does not match the plant data ({self.system.num_turbines})."
        )

    n_coords = self.location_data.groupby(
        ["turbine_latitude", "turbine_longitude"]
    ).ngroups
    duplicates = self.location_data.shape[0] - n_coords
    if duplicates > 0:
        raise ValueError(
            f"There are {duplicates} rows with duplicate coordinates."
        )

    # Ensure the number of turbines on a string is within the limits
    longest_string = self.location_data["order"].unique().size
    self.num_strings = self.location_data.groupby(
        ["substation_id", "string"]
    ).ngroups
    if longest_string > self.num_turbines_full_string:
        raise ValueError(
            "Strings can't contain more than "
            f"{self.num_turbines_full_string} turbines."
        )
    else:
        self.num_turbines_full_string = longest_string
        del self.num_turbines_partial_string
        del self.num_partial_strings
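# A standalone illustration (not part of the API above) of the optional-column
# check: a column is flagged when any entry is NaN or zero, which is why the
# null mask and the zero mask are OR-ed inside parentheses before summing.
# The column names below are made up for the example.
toy = pd.DataFrame({"cable_length": [0.0, 1.2], "bury_speed": [0.4, 0.5]})
optional = ["cable_length", "bury_speed"]
flagged = [c for c in optional if (pd.isnull(toy[c]) | (toy[c] == 0)).sum() > 0]
print(flagged)  # ['cable_length']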