def create_paths_models(df):
    # Models will be indexed in 0, 64, 128...
    lines = df.line.unique()
    jump = 64
    new_df = DataFrame()
    for line in lines:
        print('line:', line)
        current_df = df[df.line == line]
        new_df = new_df.append(current_df.iloc[0])
        possible_index_paths = current_df.index_path.unique()
        total = possible_index_paths[-1] - possible_index_paths[0]
        count = 0
        for index_path in range(possible_index_paths[0], possible_index_paths[-1], jump):
            count += 1 * jump
            current_path = current_df[current_df.index_path == index_path]
            print(count / total * 100)
            for _, row in current_path.iterrows():
                if not has_distance_from_coordinate(new_df[new_df.line == line], row,
                                                    distance=minimum_distance):
                    new_df = new_df.append(row)
    return new_df.drop(['index_path', 'order'], axis=1)
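# NOTE: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and calling it
# in a loop copies the whole frame on every iteration. A minimal sketch of the usual
# replacement - collect rows in a plain Python list and build the frame once.
# `collect_rows` and `keep_row` are illustrative names, not part of the code above.
import pandas as pd

def collect_rows(df, keep_row):
    kept = []                      # appending to a Python list is cheap
    for _, row in df.iterrows():   # row is a Series, as in the loop above
        if keep_row(row):
            kept.append(row)
    # one construction at the end instead of one DataFrame.append per row
    return pd.DataFrame(kept) if kept else pd.DataFrame(columns=df.columns)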
def _get_month_day_diffs(df_symbols):
    symbol = df_symbols[Column.SYMBOL]
    df = update_dataframe(df_symbols[Column.HISTORY], symbol)
    if df.empty:
        return df
    df_months = DataFrame(columns=Column.ALL)
    for year in df[Column.YEAR].unique():
        for month in df[Column.MONTH].unique():
            df_month = df[
                (df[Column.YEAR] == year) & (df[Column.MONTH] == month)
            ].copy()
            if df_month.empty:
                continue
            first_day = df_month[Column.DAY].min()
            df_month[Column.PERCENT] = (
                df_month[Column.OPEN]
                / df_month[df_month[Column.DAY] == first_day].iloc[0][Column.OPEN]
            )
            if df_month.shape[0] >= 28 - 10:  # 28 days in shortest Feb, minus up to 10 weekend days
                df_months = df_months.append(df_month)
            else:
                logger.debug(f"Not enough data for {symbol} in {year}.{month}")
    return df_months[
        [Column.YEAR, Column.MONTH, Column.DAY, Column.SYMBOL, Column.PERCENT]
    ]
def _get_year_day_diffs(df_symbols):
    symbol = df_symbols[Column.SYMBOL]
    df = update_dataframe(df_symbols[Column.HISTORY], symbol)
    date_column_name = get_date_column_name(df)
    df_years = DataFrame(columns=df.columns)
    for year in df[Column.YEAR].unique():
        df_year = df[df[Column.YEAR] == year].copy()
        if df_year.shape[0] < 150:
            logger.debug(f"Not enough data for {symbol} in {year}")
            continue
        first = df_year[date_column_name].min()
        df_year[Column.PERCENT] = (
            df_year[Column.OPEN]
            / df_year[df_year[date_column_name] == first].iloc[0][Column.OPEN]
        )
        assert (
            df_year.shape[0] >= 150
        ), f"Wrong data in dataframe {df_year.shape} for year {year}"
        df_years = df_years.append(df_year)
    return df_years[
        [
            date_column_name,
            Column.YEAR,
            Column.MONTH,
            Column.SYMBOL,
            Column.PERCENT,
        ]
    ]
def _parseDataFromFile(self, dataFile: str) -> DataFrame:
    with open(dataFile, encoding="utf-8") as f:
        json_data = json.load(f)
        json_dict = json_data[0]
        data_field = json_dict.get("data", None)
        if data_field is None:
            _LOGGER.warning("no 'data' field found")
            return None
        # print("xxx:", data_field)

        ## example data:
        ##        c      h      l      o      p           t      v
        ## 0  418.4  419.8  418.4  419.8  418.4  1599202800  11141
        ## 1  419.0  419.0  418.1  418.4  419.0  1599202801    334
        ## 2  418.0  419.5  418.0  419.0  418.0  1599202802    130
        dataFrame = DataFrame(data_field)
        # print( "xxx:\n", dataFrame )

        apply_on_column(dataFrame, 't', convert_timestamp_datetime)

        if self.rangeCode != "1D":
            ## add recent value to range other than "1D" (current)
            currData = GpwCurrentStockIntradayData.DAO(self.isin)
            currData.dataTime = self.dataTime
            currWorksheet = currData.getWorksheetData()
            if currWorksheet is not None:
                lastRow = currWorksheet.iloc[-1]
                dataFrame = dataFrame.append(lastRow)
                dataFrame.reset_index(drop=True, inplace=True)

        return dataFrame
    return None
def do_crawl(ticker_list, file_path, sleep):
    # write the CSV header once
    with open(file_path + '.csv', 'w', encoding='utf-8') as f:
        f.write('time,code,open,high,low,close,volume\n')

    ohlcv_list = DataFrame(columns=['time', 'code', 'open', 'high', 'low', 'close', 'volume'])

    ### first 20 rows of data
    # ticker = "KRW-BTC"
    # ohlcv_list = ohlcv_list.append(get_ohlcv(ticker, interval=INTERVAL, count=20))
    # ohlcv_list['code'] = ticker
    # save_data(ohlcv_list, file_path)
    ###

    start_date = datetime.datetime.now()
    prev = start_date
    while True:
        delta = datetime.datetime.now() - prev
        during_time = datetime.datetime.now() - start_date
        # stop after one hour (DURATION_SEC)
        if during_time.seconds >= DURATION_SEC:
            break
        for ticker in ticker_list:
            ohlcv_list = ohlcv_list.append(get_ohlcv(ticker, interval=INTERVAL, count=1))
            ohlcv_list['code'] = ticker
            save_data(ohlcv_list, file_path)
        prev = datetime.datetime.now()
        time.sleep(sleep)
class DAO(BaseWorksheetData):
    """Data access object."""

    def __init__(self):
        self.worksheet: DataFrame = None
        self.dataList: List[BaseWorksheetDAO] = []
        self.dataList.append(GpwMainIndexesData())
        self.dataList.append(GpwMacroIndexesData())
        self.dataList.append(GpwSectorsIndexesData())

    ## override
    @synchronized
    def loadWorksheet(self):
        for dataAccess in self.dataList[:-1]:
            dataAccess.loadWorksheet()
            ## set random sleep preventing "[Errno 104] Connection reset by peer"
            ## server seems to reset connection in case of detection of web scraping
            randTime = 1.0 + random.random()
            time.sleep(randTime)
        ## last element (without sleep)
        dataAccess = self.dataList[-1]
        dataAccess.loadWorksheet()

    ## override
    def getDataFrame(self) -> DataFrame:
        self.worksheet = DataFrame()
        for dataAccess in self.dataList:
            dataFrame = dataAccess.getDataFrame()
            self.worksheet = self.worksheet.append(dataFrame)
        return self.worksheet
def load_papers_df(path: str = "../gradu/material/final_results.xlsx"):
    """Loads the paper data as a pandas dataframe"""
    papers = load_papers(path)
    df = DataFrame(
        columns=["number", "name", "year", "venue", "er", "vr", "sp", "pp", "op", "pep"]
        + list(map(lambda x: "c" + str(x), range(1, 23)))
    )
    # Encodes classes and categories into separate columns
    for paper in papers:
        p = asdict(paper)
        w_classes = p["w_classes"]
        cats = p["categories"]
        for c in (x[0] for x in W_CLASSES):
            if c in w_classes:
                p[c] = 1
            else:
                p[c] = 0
        for cat in range(1, 23):
            if cat in cats:
                p["c" + str(cat)] = 1
            else:
                p["c" + str(cat)] = 0
        # Deletes unnecessary keys
        del p["abstract"]
        del p["keywords"]
        del p["w_classes"]
        del p["categories"]
        df = df.append(pd.Series(p), ignore_index=True)
    return df
def lookup_last_week_weather(look_str, weatherDF, weather_station=1):
    now = datetime.strptime(look_str, "%Y-%m-%d")
    weathers = DataFrame()
    for i in range(35):
        one_day = timedelta(days=i)
        now1 = now - one_day
        row = weatherDF[(weatherDF.Date == now1.strftime("%Y-%m-%d"))
                        & (weatherDF.Station == weather_station)]
        weathers = weathers.append(row)
    return weathers
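# Sketch of a vectorized variant of the lookup above, assuming `Date` is stored as a
# "%Y-%m-%d" string exactly as the equality test above implies. Note the result keeps
# the original row order of weatherDF rather than the day-by-day order of the loop.
from datetime import datetime, timedelta

def lookup_last_weeks_weather(look_str, weatherDF, weather_station=1):
    now = datetime.strptime(look_str, "%Y-%m-%d")
    dates = [(now - timedelta(days=i)).strftime("%Y-%m-%d") for i in range(35)]
    mask = weatherDF.Date.isin(dates) & (weatherDF.Station == weather_station)
    return weatherDF[mask]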
def create_training_input(window: WindowArgs) -> DataPair:
    """Returns a dataset containing a pair of pandas dataframes
    that can be used for supervised learning."""
    df = create_grouped_dataframe(window.data_frames)
    x_train = DataFrame()
    y_train = DataFrame()
    for win in df.rolling(window.window_size, axis=1):
        if win.shape[0] == window.window_size:
            recent = win.head(1).index
            target_date = recent + pd.DateOffset(days=window.target_shift)
            if target_date[0] in window.target.index:
                win = win.reset_index(drop=True)
                win.index = win.index + 1
                flat_win = win.stack()
                flat_win.index = flat_win.index.map('{0[1]}_{0[0]}'.format)
                x_train = x_train.append(flat_win, ignore_index=True)
                y_train = y_train.append(
                    window.target.loc[target_date], ignore_index=True)
    return DataPair(x_train, y_train)
def _parseDataFromFile(self, dataFile: str) -> DataFrame:
    # _LOGGER.debug( "opening workbook: %s", dataFile )
    allDataFrames = pandas.read_html(dataFile, thousands='', decimal=',', encoding='utf-8')
    dataFrame = DataFrame()
    dataFrame = dataFrame.append(allDataFrames[0])  ## realtime indexes
    dataFrame = dataFrame.append(allDataFrames[1])  ## main indexes
    convert_indexes_data(dataFrame)
    append_indexes_isin(dataFrame, dataFile)
    return dataFrame
def anteil_gemeinsamer_buchungen(self):
    anteil_gemeinsamer_buchungen = DataFrame()
    for _, row in self.content.iterrows():
        einzelbuchung = DataFrame([[
            row.Datum,
            row.Kategorie,
            str(row.Name) + " (noch nicht abgerechnet, von " + str(row.Person) + ")",
            row.Wert * 0.5,
            True
        ]], columns=('Datum', 'Kategorie', 'Name', 'Wert', 'Dynamisch'))
        anteil_gemeinsamer_buchungen = anteil_gemeinsamer_buchungen.append(
            einzelbuchung, ignore_index=True)
    return anteil_gemeinsamer_buchungen
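# Sketch of the same aggregation without per-row appends: collect plain dicts and build
# the DataFrame once. Assumes `content` has the same columns as `self.content` above;
# the function name is illustrative only.
from pandas import DataFrame

def anteil_gemeinsamer_buchungen_ohne_append(content):
    rows = [
        {
            'Datum': row.Datum,
            'Kategorie': row.Kategorie,
            'Name': str(row.Name) + " (noch nicht abgerechnet, von " + str(row.Person) + ")",
            'Wert': row.Wert * 0.5,
            'Dynamisch': True,
        }
        for _, row in content.iterrows()
    ]
    return DataFrame(rows, columns=['Datum', 'Kategorie', 'Name', 'Wert', 'Dynamisch'])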
def ReadStandardData(file_name):
    Data = DataFrame({})
    f = open(file_name, 'r')
    while True:
        new_line = standard_form_data._AnalyseStandardLine(f.readline())
        if type(new_line) is DataFrame:
            Data = Data.append(new_line, ignore_index=True)
        elif new_line == '#':
            continue
        elif new_line is None:
            break
    f.close()
    return Data
def _get_quarter_diffs(df_symbols): symbol = df_symbols[Column.SYMBOL] df_history = df_symbols[Column.HISTORY] df = update_dataframe(df_history, symbol) minutes = df[Column.MINUTE].unique() assert minutes.shape[0] > 3, f"Wrong data for {symbol} {minutes}" df_days = DataFrame(columns=Column.ALL) for year in df[Column.YEAR].unique(): for week in df[Column.WEEK].unique(): for day in df[Column.DAY].unique(): for hour in df[Column.HOUR].unique(): df_hour = df[ (df[Column.YEAR] == year) & (df[Column.WEEK] == week) & (df[Column.DAY] == day) & (df[Column.HOUR] == hour) ].copy() if df_hour.empty: continue first_time = df_hour[Column.MINUTE].min() df_hour[Column.PERCENT] = ( df_hour[Column.OPEN] / df_hour[df_hour[Column.MINUTE] == first_time].iloc[0][ Column.OPEN ] ) if ( df_hour.shape[0] >= 2 ): # good data is at least 2 times per hour (9:30, 9:45) df_days = df_days.append(df_hour) else: logger.debug(f"Not enough data for {symbol} in {week} {day}") df_days = df_days[df_days[Column.MINUTE].isin(range(0, 60, 15))] df_days[Column.QUARTER] = df_days[Column.MINUTE] return df_days[ [ Column.YEAR, Column.WEEK, Column.DAY, Column.HOUR, Column.MINUTE, Column.QUARTER, Column.SYMBOL, Column.PERCENT, ] ]
def getMergeAB(A, B):
    new_df = DataFrame(columns=['time', 'device'])
    i = 0
    for _, A_row in A.iterrows():
        i = i + 1
        for _, B_row in B.iterrows():
            a_data = A_row['time']
            print(a_data)
            b_data = B_row['device']
            row = DataFrame([dict(time=a_data, device=b_data)])
            new_df = new_df.append(row, ignore_index=True)
        # if i > 5:
        #     break
    return new_df
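# The nested iterrows loop above builds the Cartesian product of A['time'] and
# B['device']. A sketch of the same result with a key-less merge (pandas >= 1.2
# supports how='cross'), assuming A and B are DataFrames with those columns:
def get_merge_ab_cross(A, B):
    # cross join: every row of A paired with every row of B
    return A[['time']].merge(B[['device']], how='cross')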
def run(self):
    self.sem.acquire()
    while datetime.now() < self.timeout:
        try:
            # Randomly sized dataframe to keep appending to
            df = DataFrame({'v': [self.last]}, [datetime.now()])
            for i in range(random.randint(1, 10)):
                df = df.append(DataFrame({'v': [self.last + i]}, [datetime.now()]))
            df.index.name = 'index'
            self.lib.append('symbol', df)
            assert self.last in self.lib.read('symbol').data['v'].tolist()
            self.last += 2
        except OptimisticLockException:
            # Concurrent write, not successful
            pass
def plot_coord_num_dist_for_element_and_move_type(self, atomicNum):
    '''plot the distribution of coordination number for all of a given element
    involved in driving coordinates grouped by driving coordinate type
    '''
    coordNumsDataFrame = DataFrame()
    for reaction in self.reactions:
        for driveCoordinate in reaction._drivingCoordinates:
            for mlAtom in driveCoordinate._Atoms:
                if mlAtom._atom.atomicnum == atomicNum:
                    coordNumsDataFrame = coordNumsDataFrame.append(DataFrame(
                        {'move type': [driveCoordinate._Type],
                         'coordination number': [mlAtom._atom.valence]}),
                        ignore_index=True)
    countplot(x='coordination number', hue='move type', data=coordNumsDataFrame)
    plt.savefig(str(Path.home() / 'Desktop' / 'testPlot.png'))
def get(self):
    NUMRESULTS = 30
    THRESHOLD = 80
    query = request.args.get('q')
    choices = lookup.loc[:, 'title'].tolist()
    res = process.extract(query, choices, limit=NUMRESULTS, scorer=fuzz.partial_ratio)
    # collect required information about card and output
    out = DataFrame()
    for _, percent_match, id in res:
        if percent_match > THRESHOLD:
            card = lookup.query('id ==' + str(id)).copy()
            card.loc[:, 'percent_match'] = percent_match
            out = out.append(card)
    return Response(out.to_json(orient="records"), mimetype='application/json')
def average_csv_data(patients, filename, target, *data_path):
    data_path = data_path[0]
    df_list = []
    for p in data_path:
        df = DataFrame(columns=['clip', target])
        for patient in patients:
            d = read_csv(p + '/' + patient + target + '.csv')
            df = df.append(d)
        df_list.append(df)
    avg_df = DataFrame(columns=['clip', target])
    avg_df['clip'] = df_list[0]['clip']
    avg_df[target] = 0
    for df in df_list:
        avg_df[target] += df[target]
    avg_df[target] /= 1.0 * len(df_list)
    with open(filename + '.csv', 'w') as f:
        avg_df.to_csv(f, header=True, index=False)
def _get_hour_diffs(df_symbols):
    symbol = df_symbols[Column.SYMBOL]
    df_history = df_symbols[Column.HISTORY]
    df = update_dataframe(df_history, symbol)
    hours = df[Column.HOUR].unique()
    assert hours.shape[0] > 5, f"Wrong data for {symbol} {hours}"
    df_days = DataFrame(columns=Column.ALL)
    for year in df[Column.YEAR].unique():
        for week in df[Column.WEEK].unique():
            for day in df[Column.DAY].unique():
                df_day = df[
                    (df[Column.YEAR] == year)
                    & (df[Column.WEEK] == week)
                    & (df[Column.DAY] == day)
                ].copy()
                if df_day.empty:
                    continue
                first_hour = df_day[Column.HOUR].min()
                df_day[Column.PERCENT] = (
                    df_day[Column.OPEN]
                    / df_day[df_day[Column.HOUR] == first_hour].iloc[0][Column.OPEN]
                )
                if df_day.shape[0] >= 5:  # good data is at least 5 hours per day
                    df_days = df_days.append(df_day)
                else:
                    logger.debug(f"Not enough data for {symbol} in {week} {day}")
    return df_days[
        [
            Column.YEAR,
            Column.WEEK,
            Column.DAY,
            Column.HOUR,
            Column.SYMBOL,
            Column.PERCENT,
        ]
    ]
def parse():
    df_list = DataFrame()
    for url in urls:
        response = requests.get(url, headers=headers).text
        # parse the response text as JSON
        json_str = json.loads(response)
        # top-level title
        title = json_str['info']['title']
        print(title)
        service_path = json_str['paths']
        svc_dict = list()
        for svc, data in service_path.items():
            req = data.get('post')
            req_method = 'post'
            if req == '' or req is None:
                req = data.get('get')
                req_method = 'get'
            if req == '' or req is None:
                req = data.get('put')
                req_method = 'put'
            if req == '' or req is None:
                req = data.get('delete')
                req_method = 'delete'
            if req is not None:
                body = (title, svc, req.get('summary'), req_method)
                svc_dict.append(body)
        if df_list.empty:
            df_list = DataFrame(svc_dict)
        else:
            df_list = df_list.append(DataFrame(svc_dict))
    df_list.columns = ['title', 'url', 'description', 'method']
    if os.path.exists(file_name):
        os.remove(file_name)
    df_list.to_csv('svc.csv', encoding='utf_8_sig')
    print("finished")
def MergeKpi(dir):
    filelist = list_all_files(dir)
    dataAll = DataFrame()
    for file in filelist:
        print('file = ' + file)
        name, ext = os.path.splitext(file)
        if ext != '.xlsx':
            print('ext = ' + ext)
            continue
        print(dir + '/' + file)
        data = pd.read_excel(dir + '/' + file)  # path to the Excel file
        if dataAll.empty:
            dataAll = data
        else:
            dataAll = dataAll.append(data, ignore_index=True)
    print(dataAll)
    writer = pd.ExcelWriter(dir + "/all" + '.xlsx')
    dataAll.to_excel(writer, index=False)
    writer.save()
    writer.close()
    return
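# Sketch of the same merge with a single concat over the list of workbooks.
# Assumes list_all_files returns bare file names relative to dir_path, as the loop
# above implies; the function name is illustrative only.
import os
import pandas as pd

def merge_kpi_concat(dir_path):
    frames = [
        pd.read_excel(os.path.join(dir_path, name))
        for name in list_all_files(dir_path)
        if os.path.splitext(name)[1] == '.xlsx'
    ]
    merged = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    merged.to_excel(os.path.join(dir_path, 'all.xlsx'), index=False)
    return merged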
def _get_monthly_diffs(df_symbols):
    symbol = df_symbols[Column.SYMBOL]
    df = update_dataframe(df_symbols[Column.HISTORY], symbol)
    df_months = DataFrame(columns=Column.ALL)
    for year in df[Column.YEAR].unique():
        df_month = df[df[Column.YEAR] == year].copy()
        if df_month.shape[0] < 12:
            logger.debug(f"Not enough data for {symbol} in {year}")
            continue
        first_month = df_month[Column.MONTH].min()
        df_month[Column.PERCENT] = (
            df_month[Column.OPEN]
            / df_month[df_month[Column.MONTH] == first_month].iloc[0][Column.OPEN]
        )
        assert (
            df_month.shape[0] == 12
        ), f"Wrong number of months in dataframe {df_month.shape} for year {year}"
        df_months = df_months.append(df_month)
    return df_months[[Column.YEAR, Column.MONTH, Column.SYMBOL, Column.PERCENT]]
def _get_best_weekday_diffs(df_symbols):
    symbol = df_symbols[Column.SYMBOL]
    df = update_dataframe(df_symbols[Column.HISTORY], symbol)
    # if number of working days is less than 3 - don't count
    number_of_good_working = 3
    df_weeks = DataFrame(columns=Column.ALL)
    for year in df[Column.YEAR].unique():
        for week in df[Column.WEEK].unique():
            df_week = df[(df[Column.YEAR] == year) & (df[Column.WEEK] == week)].copy()
            if df_week.empty:
                continue
            days = df_week[Column.WEEKDAY].values
            if df_week.shape[0] < number_of_good_working:
                # first and last week of year might contain only 1-2 days
                if week not in (1, 52, 53):
                    logger.debug(
                        f"Not enough data for {symbol} in {year} week {week}: {days}"
                    )
                continue
            first_weekday = df_week[Column.WEEKDAY].min()
            df_week[Column.PERCENT] = (
                df_week[Column.OPEN]
                / df_week[df_week[Column.WEEKDAY] == first_weekday].iloc[0][Column.OPEN]
            )
            assert (
                df_week.shape[0] >= number_of_good_working
            ), f"Wrong number of weekdays in dataframe {df_week.shape} for year {year} {week}: {days}"
            df_weeks = df_weeks.append(df_week)
    return df_weeks[
        [Column.YEAR, Column.WEEK, Column.WEEKDAY, Column.SYMBOL, Column.PERCENT]
    ]
def resample_laser_by(df: DataFrame, by: DataFrame, depth):
    '''
    From the given data frames compile statistics (mean, median, min, max, etc.)
    based on the parameters.

    :param df: larger dataframe with smaller intervals, used to compile the statistic
    :param by: smaller dataframe with larger intervals, used as the index of intervals
    :return: a DataFrame of resampled statistics for the specified sample and depth,
             indexed by the depth intervals taken from ``by``.
             Can only have one matching depth.
    '''
    dc = FrameClass(df)
    dc_by = FrameClass(by)
    if depth:
        header, = process_header_str(depth)
    else:
        header, = find_match(dc, dc_by)
    df = df.set_index(header.name)
    by = by.set_index(header.name)
    by = by[(by.index >= min(df.index)) & (by.index <= max(df.index))]
    new_df = DataFrame()
    if by.empty:
        return new_df
    for i in range(len(by.index.tolist()) - 1):
        idx = df[(df.index >= by.index[i]) & (df.index <= by.index[i + 1])]
        new_df = new_df.append(idx.apply(lambda x: numpy.nanmean(x)), ignore_index=True)
    new_df = new_df.set_index(by.index[:-1])
    return new_df
def _parseDataFromFile(self, dataFile: str) -> DataFrame:
    # _LOGGER.debug( "opening workbook: %s", dataFile )
    allDataFrames = pandas.read_html(dataFile, thousands='', decimal=',', encoding='utf-8')
    dataFrame = DataFrame()
    dataFrame = dataFrame.append(allDataFrames[1])  ## country
    dataFrame = dataFrame.append(allDataFrames[2])  ## foreign
    cleanup_column(dataFrame, 'Sektor')
    apply_on_column(dataFrame, 'Liczba wyemitowanych akcji', convert_int)
    apply_on_column(dataFrame, 'Wartość rynkowa (mln zł)', convert_float)
    apply_on_column(dataFrame, 'Wartość księgowa (mln zł)', convert_float)
    apply_on_column(dataFrame, 'C/WK', convert_float)
    apply_on_column(dataFrame, 'C/Z', convert_float)
    apply_on_column(dataFrame, 'Stopa dywidendy (%)', convert_float)
    return dataFrame
Data = DataFrame({})
for ith, document in enumerate(input_list):
    if ith % 100 == 0:
        print('recording %ith, total %i' % (ith, total))
    spectr = ReadNMSSMToolsSpectr(document, ignore=ignore)
    # inNumber = re.findall(r'\d+', document)[-1]
    # outNumber += 1  # reNumber
    col_name = ['No_', 'path']
    value_row = [ith, document]
    for block, code_value_dict in spectr.__dict__.items():
        # print(block_name)
        try:
            code_2_name = getattr(block_table, block)
        except AttributeError:
            continue
        else:
            for code, value in code_value_dict.items():
                try:
                    col_name.append(code_2_name(code))
                except KeyError:
                    raise  # continue
                else:
                    value_row.append(value)
    Data = Data.append(
        DataFrame(numpy.array([value_row]), columns=col_name),
        ignore_index=True)
Data.to_csv('Data_%s.csv' % similarity)
class PandasBackend(DataBackend): _data: DataFrame _index: PandasIndex _loc: _LocIndexer _iloc: _ILocIndexer def __init__( self, data: Optional[Union(Series, DataFrame, dict[str, list])] = None, index: Optional[PandasIndex] = None, ) -> None: if data is None: self._data = DataFrame(dtype="object") elif type(data) is Series: self._data = cast(Series, data).to_frame().transpose() elif type(data) is DataFrame: self._data = DataFrame(data) elif type(data) is dict: sample_value = next(iter(data.values())) if not isinstance(sample_value, Iterable) or isinstance( sample_value, str): self._data = Series(data).to_frame().transpose() else: self._data = DataFrame(data) else: raise ValueError( f"Received unexpected value type {type(data)}: {data}") if index is None: self._data.index.name = "index" self._index = PandasIndex(self._data.index, []) else: if not isinstance(index, PandasIndex): index = PandasIndex(index) self._data.index = index._data self._index = index self._loc = _LocIndexer(self) self._iloc = _ILocIndexer(self) def is_link(self) -> bool: return False def link_token(self) -> Optional[DataToken]: return None def to_pandas(self) -> DataFrame: return self._data @property def columns(self) -> list[str]: return self._data.columns.tolist() @property def values(self) -> np.ndarray: data_values = self._data.values shape = data_values.shape if shape[1] == 1: return np.squeeze(data_values, axis=1) elif shape[0] == 1: return np.squeeze(data_values, axis=0) else: return data_values @property def dtypes(self) -> dict[str, DataType]: return { col: DataType(dtype) for col, dtype in self._data.dtypes.items() } def cast_columns(self, column_dtypes: dict[str, type]) -> PandasBackend: return PandasBackend(self._data.astype(column_dtypes, errors="ignore")) def to_dict(self) -> dict[str, any]: return self._data.to_dict("list") @property def index(self) -> Index: return self._index @property def index_name(self) -> Union[str, list[str]]: return self._data.index.name @property def loc(self: PandasBackend) -> LocIndexer[PandasBackend]: return self._loc @property def iloc(self: PandasBackend) -> ILocIndexer[PandasBackend]: return self._iloc def equals(self, other: PandasBackend) -> bool: if type(other) is not PandasBackend: return False return np.array_equal(self._data.values, other._data.values) and self._index.equals( other._index) def __eq__(self, other) -> DataFrame: if issubclass(type(other), PandasBackend): other = other._data return self._data == other def __ne__(self, other: Any) -> DataFrame: if issubclass(type(other), PandasBackend): other = other._data return self._data != other def __gt__(self, other: Any) -> DataFrame: if issubclass(type(other), PandasBackend): other = other._data return self._data > other def __ge__(self, other: Any) -> DataFrame: if issubclass(type(other), PandasBackend): other = other._data return self._data >= other def __lt__(self, other: Any) -> DataFrame: if issubclass(type(other), PandasBackend): other = other._data return self._data < other def __le__(self, other: Any) -> DataFrame: if issubclass(type(other), PandasBackend): other = other._data return self._data <= other def __len__(self) -> int: return len(self._data) def __iter__(self) -> Generator[str, None, None]: return iter(self._data) def iterrows(self) -> Generator[tuple[int, PandasBackend], None, None]: for i, row in self._data.iterrows(): yield (i, PandasBackend(row.to_frame().transpose())) def itertuples(self, ignore_index: bool = False): for values in self._data.itertuples(index=not ignore_index): yield values 
def __getitem__(self, item: str) -> Any: return PandasBackend(self._data[item].to_frame()) def getitems(self, items: list[str]) -> PandasBackend: return PandasBackend(self._data[items]) def getmask(self, mask: list[bool]) -> PandasBackend: return PandasBackend(self._data[mask]) def query(self, query: "Query") -> PandasBackend: from tanuki.database.adapter.query.pandas_query_compiler import PandasQueryCompiler query_compiler = PandasQueryCompiler(self._data) query = query_compiler.compile(query) return PandasBackend(self._data[query]) def __setitem__(self, items: str, value: Any) -> None: if isinstance(value, PandasBackend): value = value._data self._data[items] = value def get_index(self, index_alias: IndexAlias) -> Index: cols = [str(col) for col in index_alias.columns] new_data = self._data.set_index(cols) new_data.index.name = index_alias.name return PandasIndex(new_data.index, cols) def set_index(self, index: Union[Index, IndexAlias]) -> PandasBackend: cols = [str(col) for col in index.columns] new_data = self._data.set_index(cols) new_data.index.name = index.name new_index = PandasIndex(new_data.index, cols) return PandasBackend(new_data, new_index) def reset_index(self: PandasBackend) -> PandasBackend: new_data = self._data.reset_index(drop=True) new_data.index.name = "index" new_index = PandasIndex(new_data.index, []) return PandasBackend(new_data, new_index) def append( self: PandasBackend, new_backend: PandasBackend, ignore_index: bool = False, ) -> PandasBackend: return PandasBackend( self._data.append(new_backend._data, ignore_index=ignore_index)) def drop_indices(self: PandasBackend, indices: list[int]) -> PandasBackend: return PandasBackend(self._data.drop(indices)) @classmethod def concat( cls: type[PandasBackend], all_backends: list[PandasBackend], ignore_index: bool = False, ) -> PandasBackend: all_data = [backend._data for backend in all_backends] return PandasBackend(pd.concat(all_data, ignore_index=ignore_index)) def nunique(self) -> int: return self._data.nunique() def __str__(self) -> str: return str(self._data) def __repr__(self) -> str: return str(self)
        now1 = now - one_day
        row = weatherDF[(weatherDF.Date == now1.strftime("%Y-%m-%d"))
                        & (weatherDF.Station == weather_station)]
        weathers = weathers.append(row)
    return weathers


def weather_data(look_str, weatherDF):
    features = ["Tmax", "Tmin", "Tavg", "DewPoint", "WetBulb", "Heat", "Cool",
                "SnowFall", "PrecipTotal", "ResultSpeed"]
    weather_week0 = lookup_last_week_weather(look_str, weatherDF)
    weather_week = weather_week0[features]
    averagesS = weather_week.mean(0)
    maxs = weather_week.max(0)
    maxsS = pd.Series()
    mins = weather_week.min(0)
    minsS = pd.Series()
    for f in features:
        maxsS["%s_max" % f] = maxs[f]
        minsS["%s_min" % f] = mins[f]
    # datapoints = pd.concat([averagesS, maxsS, minsS])
    datapoints = averagesS
    weather_data = DataFrame(datapoints).T
    weather_data["Date"] = look_str
    return weather_data


weather_avg = DataFrame()
dates = weather["Date"]
for d in dates:
    row = weather_data(d, weather)
    weather_avg = weather_avg.append(row, ignore_index=True)
weather_avg.to_csv(os.path.join(data_dir, 'weather_info_averages5.csv'), index=False)
# duplicates()
from makstat.zavod import iter_contextual_atom_data, get_metadata

stream = (line.decode('cp1251').strip().encode('utf-8')
          for line in stdin)

# tee the stream to get the metadata for title
stream, stream_2 = tee(stream)
title = get_metadata(stream_2)['TITLE']

df = DataFrame()
for cur_data in iter_contextual_atom_data(stream):
    current = DataFrame.from_dict([cur_data])
    df = df.append(current, ignore_index=False)

index_cols = list(df.columns.values)
index_cols.remove('value')
df.set_index(index_cols, inplace=True)
df.columns = [title]

# create removable temp file for use with HDFStore
tmpfile = NamedTemporaryFile().name
store = HDFStore(tmpfile)
store['default'] = df
store.close()

# put h5 file to stdout
with open(tmpfile, 'rb') as f:
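# Sketch of the same load without the per-record append: iter_contextual_atom_data
# yields one dict per record, so the frame can be built in a single call. This replaces
# the loop above (it consumes the same stream); the helper name is illustrative only.
def frame_from_stream(stream):
    records = list(iter_contextual_atom_data(stream))
    return DataFrame.from_records(records)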
repos_with_kw_docker_2015_filepath = data_files_path + \
    'repos_with_docker_2015.csv'

df_github_repos_with_kw_docker_2011_to_2014 = DataFrame(pandas.read_csv(
    repos_with_kw_docker_2011_to_2014_filepath
)['repository_url'])


def apiurl_to_repourl(apiurl):
    return apiurl.replace('api.', '').replace('repos/', '')


df_repos_2015 = pandas.read_csv(repos_with_kw_docker_2015_filepath)['repo_url']
df_github_repos_with_kw_docker_2015 = DataFrame({
    'repository_url': list(map(apiurl_to_repourl, df_repos_2015))
})

df_repo_urls_with_kw_docker_2011_to_2015 = \
    df_github_repos_with_kw_docker_2011_to_2014.append(
        df_github_repos_with_kw_docker_2015, ignore_index=True
    )


def make_test_dataset(**kwargs):
    samplesize = kwargs['sample_size']
    testdf = df_repo_urls_with_kw_docker_2011_to_2015[:samplesize]
    # print testdf['repository_url'].drop_duplicates().values.tolist()
    return testdf['repository_url'].drop_duplicates().values.tolist()
def _h_index(self): K = self.model.K topic_counts = {} for i in range(K): sys.stdout.flush() # find the words in this topic above the threshold topic_words = self.topicdf.ix[:, i] topic_words = topic_words.iloc[topic_words.nonzero()[0]] fragment_words = {} loss_words = {} for word in topic_words.index: tokens = word.split('_') word_type = tokens[0] value = tokens[1] if word_type == 'fragment': fragment_words[value] = 0 elif word_type == 'loss': loss_words[value] = 0 # find the documents in this topic above the threshold topic_docs = self.docdf.ix[i, :] topic_docs = topic_docs.iloc[topic_docs.nonzero()[0]] # handle empty topics if topic_docs.empty: topic_counts[i] = 0 else: # now find out how many of the documents in this topic actually 'cite' the words for docname in topic_docs.index: # split mz_rt_peakid string into tokens tokens = docname.split('_') peakid = int(tokens[2]) # find all the fragment peaks of this parent peak ms2_rows = self.ms2.loc[self.ms2['MSnParentPeakID']==peakid] fragment_bin_ids = ms2_rows[['fragment_bin_id']] loss_bin_ids = ms2_rows[['loss_bin_id']] # convert from pandas dataframes to list fragment_bin_ids = fragment_bin_ids.values.ravel().tolist() loss_bin_ids = loss_bin_ids.values.ravel().tolist() # this code is too slow! # count the citation numbers # for cited in fragment_bin_ids: # if cited == 'nan': # continue # else: # if cited in fragment_words: # fragment_words[cited] = fragment_words[cited] + 1 # for cited in loss_bin_ids: # if cited == 'nan': # continue # else: # if cited in loss_words: # loss_words[cited] = loss_words[cited] + 1 # convert to dictionary for quick lookup word_dict = {} for word in fragment_bin_ids: word_dict.update({word:word}) for word in loss_bin_ids: word_dict.update({word:word}) # count the citation numbers for word in fragment_words: if word in word_dict: fragment_words[word] = fragment_words[word] + 1 for word in loss_words: if word in word_dict: loss_words[word] = loss_words[word] + 1 # make a dataframe of the articles & citation counts fragment_df = DataFrame(fragment_words, index=['counts']).transpose() loss_df = DataFrame(loss_words, index=['counts']).transpose() df = fragment_df.append(loss_df) df = df.sort(['counts'], ascending=False) # compute the h-index h_index = 0 for index, row in df.iterrows(): if row['counts'] > h_index: h_index += 1 else: break print " - Mass2Motif " + str(i) + " h-index = " + str(h_index) topic_counts[i] = h_index return topic_counts
def annotateIGSeqRead(fastaFile, chain, db, noWorkers, seqsPerFile, seqType='dna', outdir="", domainSystem='imgt', stream=None): if fastaFile is None: return Counter() # Estimate the IGV diversity in a library from igblast output printto(stream, 'The IGV clones of ' + os.path.basename(fastaFile) + ' are being annotated ...') with open(fastaFile) as f: noSeqs = sum(1 for line in f if line.startswith(">")) totalFiles = int(ceil(noSeqs / seqsPerFile)) if totalFiles < noWorkers: seqsPerFile = int(noSeqs / noWorkers) if noSeqs >= noWorkers else noSeqs totalFiles = int(ceil(noSeqs / seqsPerFile)) noSplit = noSeqs <= seqsPerFile printto(stream, "\t{0:,} sequences were found to be distributed into {1:,} file(s)" .format(noSeqs, (totalFiles if not noSplit else 1))) # todo: commented out on Thu 21 Jun 2018 16:01:06 AEST by JIAHONG FONG # reason: unknown purpose - why are sequences being trimmed using the primer argument? # chagelog: 1. newFastFile = fastaFile regardless, and remove the primer CMD argument entirely # if igRep.primer > 0: # with safeOpen(fastaFile) as fp: # recordsAll = SeqIO.to_dict(SeqIO.parse(fp, 'fasta')) # records = [] # for id_ in recordsAll: # rec = recordsAll[id_] # rec.description = '' # rec.seq = rec.seq[:igRep.primer] # records.append(rec) # filesDir = os.path.join(outdir, "tmp") # newFastFile = os.path.join(filesDir, "seqs.fasta") # SeqIO.write(records, newFastFile, 'fasta') # else: # newFastFile = fastaFile newFastFile = fastaFile # if we only asked for one worker or if the sequences within the fasta file is smaller than the threshold in # in seqsPerFile, we can just analyze the file without splitting it if noWorkers == 1 or noSplit: cloneAnnot, filteredIDs = analyzeSmallFile(newFastFile, chain, db, seqType, noWorkers, outdir, domainSystem=domainSystem, stream=stream) else: # split FASTA file into smaller files prefix, ext = os.path.splitext(os.path.basename(fastaFile)) filesDir = os.path.join(outdir, "tmp") prefix = prefix[prefix.find("_R")+1:prefix.find("_R")+3] + "_" if (prefix.find("_R") != -1) else "" splitFastaFile(fastaFile, totalFiles, seqsPerFile, filesDir, prefix, ext, stream=stream) # Prepare the multiprocessing queues tasks = Queue() outcomes = Queue() exitQueue = Queue() cloneAnnot = DataFrame() filteredIDs = [] workers = [] try: # Initialize workers for _ in range(noWorkers): w = IgBlastWorker(chain, db, seqType, int(ceil(noWorkers / totalFiles)), domainSystem=domainSystem, stream=stream) w.tasksQueue = tasks w.resultsQueue = outcomes w.exitQueue = exitQueue workers.append(w) w.start() sys.stdout.flush() # initialize tasks queue with file names for i in range(totalFiles): tasks.put(os.path.join(filesDir, prefix + "part" + str(i + 1) + ext)) # Add a poison pill for each worker for _ in range(noWorkers + 10): tasks.put(None) # Wait all process workers to terminate i = 0 while i < noWorkers: m = exitQueue.get() if m == "exit": i += 1 # Collect results printto(stream, "Results are being collated from all workers ...") sys.stdout.flush() while totalFiles: outcome = outcomes.get() totalFiles -= 1 if outcome is None: continue (cloneAnnoti, fileteredIDsi) = outcome cloneAnnot = cloneAnnot.append(cloneAnnoti) filteredIDs += fileteredIDsi sys.stdout.flush() gc.collect() printto(stream, "\tResults were collated successfully.") except Exception: printto(stream, "Something went wrong during the annotation process!", LEVEL.EXCEPT) raise finally: for w in workers: w.terminate() # Clean folders to save space # TODO: remove .fasta and .out files if noSeqs > seqsPerFile and 
os.path.exists(filesDir + os.path.sep + prefix + "part1" + ext): map(os.remove, glob.glob(filesDir + os.path.sep + "*" + ext)) return cloneAnnot, filteredIDs
    title='Please select your creative')
root.destroy()
# for root, dirs, files in os.walk(r'' + cre_path + ''):
#     pass
files = []
for i in cre_path:
    files.append(i.split('/')[-1])
#####################################################
if cam_tye == '1':
    if cre_frmat == '1':  # same + video
        # duplicate the template rows, once per creative file
        new_data = DataFrame()
        for i in range(len(files)):
            new_data = new_data.append(raw_data)
        # assign Creative Type
        new_data['Creative Type'] = 'Video Page Post Ad'
        for file_index in range(len(files)):
            # creative name: each block of row_num rows maps to one file
            new_data['Video File Name'][row_num * file_index:row_num * (file_index + 1)] = files[file_index]
            # campaign name = old campaign name + creative name
            new_data['Campaign Name'][row_num * file_index:row_num * (file_index + 1)] = \
                new_data['Campaign Name'][row_num * file_index:row_num * (file_index + 1)] + '_' + files[file_index]
def trial_2(login_info):
    # Attempts to merge dataframes with two queries
    dsn_tns = cx_Oracle.makedsn('ora-tns-qcc1.in.qservco.com', 1521, service_name='qcc1')
    connection = cx_Oracle.connect(user=login_info[0], password=login_info[1], dsn=dsn_tns)
    print('=' * 60)
    print('\t - Connection established.')

    query1 = '''
        select distinct s.sub_id, s.sub_nm, vtc.trans_dttm as latest_invoice_date, vtp.trans_dttm as last_payment_date
        from wasabi.v_srvc svc
        join wasabi.v_sub s on s.sub_id = svc.sub_id
        join wasabi.v_srvc_ct sct on sct.srvc_id = svc.srvc_id
        join wasabi.v_ct_ct_num ccn on ccn.srvc_ct_id = sct.srvc_ct_id
        join (
            select * from wasabi.v_trans
            where trans_cls = 'R' and amt > 0 and trans_stat = 'I'
              and trans_typ in (
                select trans_typ from wasabi.v_trans_typ where lower(trans_nm) like '%equip%'
              )
        ) vtc on vtc.sub_id = s.sub_id
        join (
            select sub_id, max(trans_dttm) trans_dttm
            from wasabi.v_trans
            where trans_cls = 'P'
            group by sub_id
        ) vtp on vtp.sub_id = s.sub_id
        where svc.stat = 'A'
          and (ccn.ct_num_typ = 21707 or ccn.ct_typ = 364)
          and vtp.trans_dttm between trunc(sysdate)-30 and trunc(sysdate)
    '''
    query2 = '''
        select sub_id, trans_dttm
        from wasabi.v_trans
        where trans_cls = 'P' and sub_id = :sub_id
    '''

    df1 = pd.read_sql(query1, connection)
    print('\t - Read sql for df1.')

    df2 = DataFrame(columns=['SUB_ID', 'TRANS_DTTM'])
    for row in df1['SUB_ID']:
        # DataFrame.append returns a new frame, so the result must be reassigned
        df2 = df2.append(pd.read_sql(query2, connection, params={'sub_id': row}),
                         ignore_index=True)
    # df2 = pd.read_sql(query2, connection, params={'sub_id': df1['SUB_ID']})
    print('\t - Read sql for df2.')
    print(f'\t - DF1 Head:\n{df1.head}')
    print(f'\t - DF2 Head:\n{df2.head}')

    df = pd.merge(df1, df2)  # , on = "sub_id")
    print('\t - Merged Dataframes.')
    print(f'\t - DF Info:\n{df.info}')
    print(f'\t - DF Head:\n{df.head}')

    filename = 'Billed_Equipment_Paid_6mo.xlsx'
    sheetname = 'Subscribers & Equipment'
    writer = pd.ExcelWriter(filename, engine='xlsxwriter')
    df.to_excel(writer, sheet_name=sheetname, header=False, index=False)
    worksheet = writer.sheets[sheetname]
    (max_row, max_col) = df.shape
    headers = [{'header': header} for header in df.columns]
    worksheet.add_table(0, 0, max_row - 1, max_col - 1, {'columns': headers})
    writer.save()
    print('\t - Finished.')
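# Sketch of the df2 build-up with one concat instead of repeated appends, using the same
# query2 and connection as above; the helper name is illustrative only.
def fetch_payments(df1, connection, query2):
    frames = [
        pd.read_sql(query2, connection, params={'sub_id': sub_id})
        for sub_id in df1['SUB_ID']
    ]
    return pd.concat(frames, ignore_index=True) if frames else \
        DataFrame(columns=['SUB_ID', 'TRANS_DTTM'])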
ax[i][c].set_xlabel('Decision tree importances') ax[i][c].set_ylabel('Features') ax[i][c].set_title(adt_clr.columns[i] + " random_state: " + str(random[c])) f.tight_layout() f.savefig('E:\\ML_py\\decision_tree_feature_importance.png', dpi=600, format='png') f.clf() # protein CD19 i = 8 rf = RandomForestRegressor(**param) cor = [] for j in range(0, rna_log.shape[1]): cor.append( pearsonr(adt_clr[adt_clr.columns[i]], rna_log[rna_log.columns[j]])[0]) cor = DataFrame(cor) cor.index = rna_log.columns cor.columns = ['Pearson'] cor = cor.sort_values(axis=0, ascending=False, by='Pearson') cor = cor[0:20] #the top 20 genes rna_change = rna_log[cor.index] x_train, x_test, y_train, y_test = train_test_split( rna_change, adt_clr[adt_clr.columns[i]], test_size=0.3, random_state=0) rf = rf.fit(x_train.values, y_train.values) f, ax = plt.subplots(2, 2, figsize=(70, 50)) #figsize (figsize[2]*35,figsize[1]*25) importance = rf.feature_importances_ importance = DataFrame(importance)
def _h_index(self): topic_counts = {} n_topics = self.model.K for i in range(n_topics): sys.stdout.flush() # find the words in this topic above the threshold fragment_words = self._get_nonzero_words(i, self.topicdfs, 0) loss_words = self._get_nonzero_words(i, self.topicdfs, 1) # find the documents in this topic above the threshold topic_docs = self.docdf.ix[i, :] topic_docs = topic_docs.iloc[topic_docs.nonzero()[0]] # handle empty topics if topic_docs.empty: topic_counts[i] = 0 else: # now find out how many of the documents in this topic actually 'cite' the words for docname in topic_docs.index: # split mz_rt_peakid string into tokens tokens = docname.split('_') peakid = int(tokens[2]) # find all the fragment peaks of this parent peak ms2_rows = self.ms2.loc[self.ms2['MSnParentPeakID']==peakid] fragment_bin_ids = ms2_rows[['fragment_bin_id']] loss_bin_ids = ms2_rows[['loss_bin_id']] # convert from pandas dataframes to list fragment_bin_ids = fragment_bin_ids.values.ravel().tolist() loss_bin_ids = loss_bin_ids.values.ravel().tolist() # count the citation numbers for cited in fragment_bin_ids: if cited == 'nan': continue else: if cited in fragment_words: fragment_words[cited] = fragment_words[cited] + 1 for cited in loss_bin_ids: if cited == 'nan': continue else: if cited in loss_words: loss_words[cited] = loss_words[cited] + 1 # make a dataframe of the articles & citation counts fragment_df = DataFrame(fragment_words, index=['counts']).transpose() loss_df = DataFrame(loss_words, index=['counts']).transpose() df = fragment_df.append(loss_df) df = df.sort(['counts'], ascending=False) # compute the h-index h_index = 0 for index, row in df.iterrows(): if row['counts'] > h_index: h_index += 1 else: break topic_counts[i] = h_index return topic_counts
    df = pd.read_excel('data/anno_xls/19monthold36monthold.xlsx',
                       sheet_name=str(sheetid) + 'MonthOld')
    df.to_excel(writer, sheet_name=str(sheetid) + 'MonthOld', index=False)
    # print(df)
writer.save()

# Merge into one sheet
writer = pd.ExcelWriter('data/anno_xls/onesheet.xlsx', engine='xlsxwriter')
source = []
df = DataFrame()
for sheetid in range(0, 18 + 1):
    df_read = pd.read_excel('data/anno_xls/0monthold18monthold.xlsx',
                            sheet_name=str(sheetid) + 'MonthOld')
    # print(df_read)
    df = df.append(df_read, ignore_index=True)
    # print(df)
    for _ in range(len(df_read)):
        source.append(str(sheetid) + 'MonthOld.txt')
for sheetid in range(19, 36 + 1):
    df_read = pd.read_excel('data/anno_xls/19monthold36monthold.xlsx',
                            sheet_name=str(sheetid) + 'MonthOld')
    df_read = df_read.iloc[:, 0:2]
    # print(df_read)
    df = df.append(df_read, ignore_index=True)
    for _ in range(len(df_read)):
        source.append(str(sheetid) + 'MonthOld.txt')
# print(source)
df['Source'] = source
print(df.head())
print(df.tail())
def main(mytimer: func.TimerRequest, outputBlob: func.Out[str]) -> None: logger = logging.getLogger("logger_name") logger.disabled = True blob_service_actuals = ContainerClient.from_connection_string( os.environ['Blockblob'], "actuals", logger=logger) blob_service_pickles = ContainerClient.from_connection_string( os.environ['Blockblob'], "treepickles", logger=logger) f = blob_service_actuals.download_blob("weather_predictions.json") s = blob_service_actuals.download_blob("snow_depths.json") o = blob_service_actuals.download_blob("weather_observations.json") df = pd.read_json(f.content_as_text()) dfSnow = pd.read_json(s.content_as_text()) dfObs = pd.read_json(o.content_as_text()) dfOut = DataFrame(columns=[ 'station_id', 'station_name', 'weather_forecast_ref_time', 'obs', 'day', 'interval', 'prediction_sort_counter', 'prediction_time_from', 'prediction_time_to', 'closed_prediction' ]) for station_id in df["station_id"].unique(): snow_depth = dfSnow[dfSnow['Station_id'] == station_id].reset_index().get('Snødybde').fillna(0) air_temp = dfObs[dfObs['Station_id'] == station_id].reset_index().get( 'air_temp').fillna(0) relative_humidity = dfObs[dfObs['Station_id'] == station_id].reset_index().get( 'relative_humidity').fillna(0) dew_point_temp = dfObs[dfObs['Station_id'] == station_id].reset_index( ).get('dew_point_temp').fillna(0) wind_speed = dfObs[dfObs['Station_id'] == station_id].reset_index( ).get('wind_speed').fillna(0) wind_bearing = dfObs[dfObs['Station_id'] == station_id].reset_index( ).get('wind_bearing').fillna(0) min_visibility_dist = dfObs[dfObs['Station_id'] == station_id].reset_index().get( 'min_visibility_dist').fillna(0) precipitation_intensity = dfObs[ dfObs['Station_id'] == station_id].reset_index().get( 'precipitation_intensity').fillna(0) road_temp = dfObs[dfObs['Station_id'] == station_id].reset_index().get( 'road_temp').fillna(0) for obs in range(1, 5, 1): # 4 observasjoner pr døgn for day in df["prediction_day"].unique( ): # looper gjennom predikerte dager [0,1,2] if day < 3: # tar bort værmeldinger for 3 dager frem. 
for interval in df["prediction_interval"].unique( ): # looper gjennom de fire tidsintervallene pr dag dfInput = DataFrame() for h in range(6, 55, 6): r = df[(df['station_id'] == station_id) & (df['prediction_hour'] == h)].copy( deep=True).reset_index() dfInput['h' + str(h) + '_air_temp'] = r['air_temp'] dfInput['h' + str(h) + '_precipitation_amount'] = r[ 'precipitation_amount'].fillna(0) dfInput['h' + str(h) + '_wind_bearing_sin'] = np.sin( 2 * np.pi * r['wind_bearing'] / 360.0) dfInput['h' + str(h) + '_wind_bearing_cos'] = np.cos( 2 * np.pi * r['wind_bearing'] / 360.0) dfInput['h' + str(h) + '_wind_speed'] = r['wind_speed'] dfInput['h' + str(h) + '_cloud_area_fraction'] = r[ 'cloud_area_fraction'] / 100 dfInput['h' + str(h) + '_air_pressure_at_sea_level'] = r[ 'air_pressure_at_sea_level'] * 100 dfInput['h' + str(h) + '_relative_humidity'] = r[ 'relative_humidity'] / 100 dfInput['air_temp'] = air_temp dfInput['relative_humidity'] = relative_humidity dfInput['dew_point_temp'] = dew_point_temp dfInput['wind_speed'] = wind_speed dfInput['min_visibility_dist'] = min_visibility_dist dfInput[ 'precipitation_intensity'] = precipitation_intensity dfInput['road_temp'] = road_temp dfInput['snow_depth'] = snow_depth dfInput['wind_bearing_sin'] = np.sin( 2 * np.pi * wind_bearing / 360.0) dfInput['wind_bearing_cos'] = np.cos( 2 * np.pi * wind_bearing / 360.0) dfInput['day_this_winter'] = datetime.today( ).timetuple().tm_yday - 274 if 0 < datetime.today( ).timetuple().tm_yday - 274 < 365 else datetime.today( ).timetuple().tm_yday + 91 pic = blob_service_pickles.download_blob( build_file_name(station_id, obs, day, interval)).readall() model = RandomForestClassifier() model = pickle.loads(pic) ynew = model.predict_proba(dfInput.values) obs_now = math.floor(datetime.now().hour / 6) + 1 prediction_time_from = datetime.now().replace( hour=0, minute=0, second=0, microsecond=0) + timedelta(hours=int((day * 24) + ( (interval - 1) * 6))) prediction_time_to = prediction_time_from + timedelta( hours=6) if (obs == obs_now) and (prediction_time_from > datetime.now()): dfOut = dfOut.append( { 'station_id': station_id, 'station_name': road_stations.get(station_id), 'weather_forecast_ref_time': datetime_from_utc_to_local( parse(r['forecast_ref_time_zulu'][0])). strftime("%d.%m.%Y %H:%M:%S"), 'obs': obs, 'day': day, 'interval': interval, 'prediction_sort_counter': prediction_time_from, 'prediction_time_from': prediction_time_from.strftime( "%d.%m.%Y %H:%M:%S"), 'prediction_time_to': prediction_time_to.strftime( "%d.%m.%Y %H:%M:%S"), 'closed_prediction': ynew[0][1] }, ignore_index=True) dfOut = dfOut.sort_values(by=['station_id', 'obs', 'day', 'interval']) outputBlob.set( dfOut.to_json(orient='records', force_ascii=False, indent=2, index=True))