Example #1
def create_paths_models(df):

	# Models will be indexed in 0, 64, 128...

	lines = df.line.unique()
	jump = 64

	new_df = DataFrame()

	for line in lines:
		print('line:', line)
		current_df = df[df.line == line]

		new_df = new_df.append(current_df.iloc[0])

		possible_index_paths = current_df.index_path.unique()
		total = possible_index_paths[-1] - possible_index_paths[0]
		count = 0
		for index_path in range(possible_index_paths[0], possible_index_paths[-1], jump):
			
			count += (1*jump)
			current_path = current_df[current_df.index_path == index_path]
			print(count/total *100)
			for row in current_path.iterrows():
				row = row[1]
				if not has_distance_from_coordinate(new_df[new_df.line == line], row, distance=minimum_distance):
					new_df = new_df.append(row)
					

	return new_df.drop(['index_path', 'order'], axis=1)
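Note: like most snippets in this collection, the function above grows its result by calling DataFrame.append inside a loop. append was deprecated in pandas 1.4 and removed in 2.0, and every call copies the whole frame. A minimal sketch of the same accumulate-then-build pattern, on toy data and with a placeholder filter rather than the original distance check:

import pandas as pd

# Toy frame standing in for the 'df' argument above (values are made up).
df = pd.DataFrame({"line": [1, 1, 2], "x": [0.0, 5.0, 9.0]})

# Collect the rows to keep in a plain list and build the result once,
# instead of appending to a DataFrame on every iteration.
kept_rows = []
for _, row in df.iterrows():
    if row["x"] < 6.0:  # placeholder filter, not the original distance check
        kept_rows.append(row)

new_df = pd.DataFrame(kept_rows)
print(new_df)

# When whole frames are accumulated, pandas.concat plays the same role:
# new_df = pd.concat(list_of_frames, ignore_index=True)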
Example #2
def _get_month_day_diffs(df_symbols):
    symbol = df_symbols[Column.SYMBOL]
    df = update_dataframe(df_symbols[Column.HISTORY], symbol)

    if df.empty:
        return df

    df_months = DataFrame(columns=Column.ALL)
    for year in df[Column.YEAR].unique():
        for month in df[Column.MONTH].unique():
            df_month = df[
                (df[Column.YEAR] == year) & (df[Column.MONTH] == month)
            ].copy()
            if df_month.empty:
                continue
            first_day = df_month[Column.DAY].min()
            df_month[Column.PERCENT] = (
                df_month[Column.OPEN]
                / df_month[df_month[Column.DAY] == first_day].iloc[0][Column.OPEN]
            )
            if (
                df_month.shape[0] >= 28 - 10
            ):  # 28 days in shortest Feb, 10 days - weekends max
                df_months = df_months.append(df_month)
            else:
                logger.debug(f"Not enough data for {symbol} in {year}.{month}")

    return df_months[
        [Column.YEAR, Column.MONTH, Column.DAY, Column.SYMBOL, Column.PERCENT]
    ]
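The nested loops above normalize each month's OPEN against the month's first trading day. The same ratio can be computed without explicit loops via groupby/transform; a sketch with plain string column names instead of the Column constants used by this code base:

import pandas as pd

# Toy history frame; the real code carries the project's Column values.
df = pd.DataFrame({
    "year":  [2020, 2020, 2020, 2020],
    "month": [1, 1, 2, 2],
    "day":   [2, 3, 3, 4],
    "open":  [10.0, 11.0, 20.0, 25.0],
})

# Divide every OPEN by the first OPEN of its (year, month) group.
df = df.sort_values(["year", "month", "day"])
df["percent"] = df["open"] / df.groupby(["year", "month"])["open"].transform("first")
print(df)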
Example #3
def _get_year_day_diffs(df_symbols):
    symbol = df_symbols[Column.SYMBOL]
    df = update_dataframe(df_symbols[Column.HISTORY], symbol)
    date_column_name = get_date_column_name(df)

    df_years = DataFrame(columns=df.columns)
    for year in df[Column.YEAR].unique():
        df_year = df[df[Column.YEAR] == year].copy()
        if df_year.shape[0] < 150:
            logger.debug(f"Not enough data for {symbol} in {year}")
            continue

        first = df_year[date_column_name].min()
        df_year[Column.PERCENT] = (
            df_year[Column.OPEN]
            / df_year[df_year[date_column_name] == first].iloc[0][Column.OPEN]
        )
        assert (
            df_year.shape[0] > 150
        ), f"Wrong data in dataframe {df_year.shape} for year {year}"

        df_years = df_years.append(df_year)

    return df_years[
        [
            date_column_name,
            Column.YEAR,
            Column.MONTH,
            Column.SYMBOL,
            Column.PERCENT,
        ]
    ]
Example #4
        def _parseDataFromFile(self, dataFile: str) -> DataFrame:
            with open(dataFile, encoding="utf-8") as f:
                json_data = json.load(f)
                json_dict = json_data[0]
                data_field = json_dict.get("data", None)
                if data_field is None:
                    _LOGGER.warning("no 'data' field found")
                    return None
    #             print("xxx:", data_field)

    ## example data
    #            c      h      l      o      p           t      v
    # 0     418.4  419.8  418.4  419.8  418.4  1599202800  11141
    # 1     419.0  419.0  418.1  418.4  419.0  1599202801    334
    # 2     418.0  419.5  418.0  419.0  418.0  1599202802    130

                dataFrame = DataFrame(data_field)
                #             print( "xxx:\n", dataFrame )

                apply_on_column(dataFrame, 't', convert_timestamp_datetime)

                if self.rangeCode != "1D":
                    ## add recent value to range other than "1D" (current)
                    currData = GpwCurrentStockIntradayData.DAO(self.isin)
                    currData.dataTime = self.dataTime
                    currWorksheet = currData.getWorksheetData()
                    if currWorksheet is not None:
                        lastRow = currWorksheet.iloc[-1]
                        dataFrame = dataFrame.append(lastRow)
                        dataFrame.reset_index(drop=True, inplace=True)

                return dataFrame

            return None
Example #5
def do_crawl(ticker_list, file_path, sleep):
    # the file is closed automatically when the with-block exits
    with open(file_path+'.csv', 'w', encoding='utf-8') as f:
        f.write('time,code,open,high,low,close,volume\n')



    ohlcv_list = DataFrame(columns=['time','code','open','high','low','close','volume'])
    ### initial 20 rows of data
    # ticker = "KRW-BTC"
    # ohlcv_list = ohlcv_list.append(get_ohlcv(ticker, interval=INTERVAL, count=20))
    # ohlcv_list['code'] = ticker
    # save_data(ohlcv_list, file_path)
    ###

    start_date = datetime.datetime.now()
    prev = start_date
    while True: 
        delta = datetime.datetime.now() - prev

        during_time = datetime.datetime.now() - start_date
        
        # terminate after 1 hour
        if( during_time.seconds >= DURATION_SEC):
            break
        

        for ticker in ticker_list:
            ohlcv_list = ohlcv_list.append(get_ohlcv(ticker, interval=INTERVAL, count=1))
            ohlcv_list['code'] = ticker
            save_data(ohlcv_list, file_path)

        prev = datetime.datetime.now()
        time.sleep(sleep)
Example #6
    class DAO(BaseWorksheetData):
        """Data access object."""
        def __init__(self):
            self.worksheet: DataFrame = None
            self.dataList: List[BaseWorksheetDAO] = []
            self.dataList.append(GpwMainIndexesData())
            self.dataList.append(GpwMacroIndexesData())
            self.dataList.append(GpwSectorsIndexesData())

        ## override
        @synchronized
        def loadWorksheet(self):
            for dataAccess in self.dataList[:-1]:
                dataAccess.loadWorksheet()
            ## sleep a random amount to prevent "[Errno 104] Connection reset by peer";
            ## the server seems to reset the connection when it detects web scraping
                randTime = 1.0 + random.random()
                time.sleep(randTime)
            ## last element (without sleep)
            dataAccess = self.dataList[-1]
            dataAccess.loadWorksheet()

        ## override
        def getDataFrame(self) -> DataFrame:
            self.worksheet = DataFrame()
            for dataAccess in self.dataList:
                dataFrame = dataAccess.getDataFrame()
                self.worksheet = self.worksheet.append(dataFrame)
            return self.worksheet
Example #7
def load_papers_df(path: str = "../gradu/material/final_results.xlsx"):
    """Loads the paper data as a pandas dataframe"""
    papers = load_papers(path)
    df = DataFrame(
        columns=["number", "name", "year", "venue", "er", "vr", "sp", "pp", "op", "pep"]
        + list(map(lambda x: "c" + str(x), range(1, 23)))
    )
    # Encodes classes and categories into separate columns
    for paper in papers:
        p = asdict(paper)
        w_classes = p["w_classes"]
        cats = p["categories"]
        for c in (x[0] for x in W_CLASSES):
            if c in w_classes:
                p[c] = 1
            else:
                p[c] = 0
        for cat in range(1, 23):
            if cat in cats:
                p["c" + str(cat)] = 1
            else:
                p["c" + str(cat)] = 0

        # Deletes unnecessary keys
        del p["abstract"]
        del p["keywords"]
        del p["w_classes"]
        del p["categories"]
        df = df.append(pd.Series(p), ignore_index=True)
    return df
Example #8
def lookup_last_week_weather(look_str, weatherDF, weather_station=1):
    now = datetime.strptime(look_str, "%Y-%m-%d")
    weathers = DataFrame()
    for i in range(35):
        one_day = timedelta(days=i)
        now1 = now - one_day
        row = weatherDF[(weatherDF.Date == now1.strftime("%Y-%m-%d")) & (weatherDF.Station == weather_station)]
        weathers = weathers.append(row)
    return weathers
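lookup_last_week_weather filters the weather frame once per day over a 35-day window. Building the list of date strings up front and filtering once with isin yields the same rows in a single selection; a sketch (hypothetical variant name, and row order follows weatherDF rather than the day offsets):

from datetime import datetime, timedelta

def lookup_last_weeks_weather(look_str, weatherDF, weather_station=1):
    # Same 35-day window as above, expressed as one boolean selection.
    now = datetime.strptime(look_str, "%Y-%m-%d")
    dates = [(now - timedelta(days=i)).strftime("%Y-%m-%d") for i in range(35)]
    mask = weatherDF.Date.isin(dates) & (weatherDF.Station == weather_station)
    return weatherDF[mask]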
Example #9
def create_training_input(window: WindowArgs) -> DataPair:
    """Returns a dataset containing a pair of pandas dataframes that can
    be used for supervised learning."""
    df = create_grouped_dataframe(window.data_frames)
    x_train = DataFrame()
    y_train = DataFrame()
    for win in df.rolling(window.window_size, axis=1):
        if win.shape[0] == window.window_size:
            recent = win.head(1).index
            target_date = recent + pd.DateOffset(days=window.target_shift)
            if target_date[0] in window.target.index:
                win = win.reset_index(drop=True)
                win.index = win.index + 1
                flat_win = win.stack()
                flat_win.index = flat_win.index.map('{0[1]}_{0[0]}'.format)
                x_train = x_train.append(flat_win, ignore_index=True)
                y_train = y_train.append(
                    window.target.loc[target_date], ignore_index=True)
    return DataPair(x_train, y_train)
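The interesting step in create_training_input is flattening each rolling window into a single feature row with stack() and an index formatter. That step in isolation, on a made-up three-row window with two columns:

import pandas as pd

# A made-up 3-row window with two feature columns.
win = pd.DataFrame({"open": [1.0, 2.0, 3.0], "close": [1.5, 2.5, 3.5]})

# Re-number the rows 1..window_size, then stack into a Series whose
# MultiIndex holds (row position, column name) pairs.
win = win.reset_index(drop=True)
win.index = win.index + 1
flat_win = win.stack()

# Collapse the MultiIndex into flat names such as 'open_1', 'close_1', 'open_2', ...
flat_win.index = flat_win.index.map('{0[1]}_{0[0]}'.format)
print(flat_win)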
Example #10
 def _parseDataFromFile(self, dataFile: str) -> DataFrame:
     #             _LOGGER.debug( "opening workbook: %s", dataFile )
     allDataFrames = pandas.read_html(dataFile,
                                      thousands='',
                                      decimal=',',
                                      encoding='utf-8')
     dataFrame = DataFrame()
     dataFrame = dataFrame.append(allDataFrames[0])  ## realtime indexes
     dataFrame = dataFrame.append(allDataFrames[1])  ## main indexes
     convert_indexes_data(dataFrame)
     append_indexes_isin(dataFrame, dataFile)
     return dataFrame
Example #11
 def anteil_gemeinsamer_buchungen(self):
     anteil_gemeinsamer_buchungen = DataFrame()
     for _, row in self.content.iterrows():
         einzelbuchung = DataFrame([[
             row.Datum, row.Kategorie,
             str(row.Name) + " (noch nicht abgerechnet, von " +
             str(row.Person) + ")", row.Wert * 0.5, True
         ]],
                                   columns=('Datum', 'Kategorie', 'Name',
                                            'Wert', 'Dynamisch'))
         anteil_gemeinsamer_buchungen = anteil_gemeinsamer_buchungen.append(
             einzelbuchung, ignore_index=True)
     return anteil_gemeinsamer_buchungen
Example #12
 def ReadStandardData(file_name):
     Data=DataFrame({})
     f=open(file_name,'r')
     while True:
         new_line=standard_form_data._AnalyseStandardLine(f.readline())
         if type(new_line) is DataFrame:
             Data=Data.append(new_line,ignore_index=True)
         elif new_line == '#':
             continue
         elif new_line==None:
             break
     f.close()
     return Data
Example #13
def _get_quarter_diffs(df_symbols):
    symbol = df_symbols[Column.SYMBOL]
    df_history = df_symbols[Column.HISTORY]

    df = update_dataframe(df_history, symbol)

    minutes = df[Column.MINUTE].unique()
    assert minutes.shape[0] > 3, f"Wrong data for {symbol} {minutes}"

    df_days = DataFrame(columns=Column.ALL)
    for year in df[Column.YEAR].unique():
        for week in df[Column.WEEK].unique():
            for day in df[Column.DAY].unique():
                for hour in df[Column.HOUR].unique():
                    df_hour = df[
                        (df[Column.YEAR] == year)
                        & (df[Column.WEEK] == week)
                        & (df[Column.DAY] == day)
                        & (df[Column.HOUR] == hour)
                    ].copy()
                    if df_hour.empty:
                        continue
                    first_time = df_hour[Column.MINUTE].min()
                    df_hour[Column.PERCENT] = (
                        df_hour[Column.OPEN]
                        / df_hour[df_hour[Column.MINUTE] == first_time].iloc[0][
                            Column.OPEN
                        ]
                    )
                    if (
                        df_hour.shape[0] >= 2
                    ):  # good data is at least 2 times per hour (9:30, 9:45)
                        df_days = df_days.append(df_hour)
                    else:
                        logger.debug(f"Not enough data for {symbol} in {week} {day}")

    df_days = df_days[df_days[Column.MINUTE].isin(range(0, 60, 15))]
    df_days[Column.QUARTER] = df_days[Column.MINUTE]
    return df_days[
        [
            Column.YEAR,
            Column.WEEK,
            Column.DAY,
            Column.HOUR,
            Column.MINUTE,
            Column.QUARTER,
            Column.SYMBOL,
            Column.PERCENT,
        ]
    ]
Example #14
def getMergeAB(A, B):
    new_df = DataFrame(columns=['time', 'device'])
    i = 0
    for _, A_row in A.iterrows():
        i = i + 1
        for _, B_row in B.iterrows():

            a_data = A_row['time']
            print(a_data)
            b_data = B_row['device']
            row = DataFrame([dict(time=a_data, device=b_data)])
            new_df = new_df.append(row, ignore_index=True)
        #if i > 5:
        #   break
    return new_df
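getMergeAB builds the Cartesian product of A's time column and B's device column through two nested iterrows loops. Since pandas 1.2 the same product can be requested directly with merge(how="cross"); a sketch on toy frames:

import pandas as pd

A = pd.DataFrame({"time": ["09:00", "09:01"]})
B = pd.DataFrame({"device": ["d1", "d2", "d3"]})

# Cartesian product of the two columns (2 x 3 = 6 rows), equivalent to the
# nested-loop append above.
new_df = A[["time"]].merge(B[["device"]], how="cross")
print(new_df)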
Example #15
 def run(self):
     self.sem.acquire()
     while datetime.now() < self.timeout:
         try:
             # Random-length dataframe to keep appending to
             df = DataFrame({'v': [self.last]}, [datetime.now()])
             for i in range(random.randint(1, 10)):
                 df = df.append(DataFrame({'v': [self.last + i]}, [datetime.now()]))
             self.last + i
             df.index.name = 'index'
             self.lib.append('symbol', df)
             assert self.last in self.lib.read('symbol').data['v'].tolist()
             self.last += 2
         except OptimisticLockException:
             # Concurrent write, not successful
             pass
Example #17
    def plot_coord_num_dist_for_element_and_move_type(self, atomicNum):
        '''Plot the distribution of coordination numbers for all atoms of a given element
        involved in driving coordinates, grouped by driving coordinate type.
        '''

        coordNumsDataFrame = DataFrame()
        for reaction in self.reactions:
            for driveCoordinate in reaction._drivingCoordinates:
                for mlAtom in driveCoordinate._Atoms:
                    if mlAtom._atom.atomicnum == atomicNum:
                        coordNumsDataFrame = coordNumsDataFrame.append(DataFrame(
                                {'move type':[driveCoordinate._Type],
                                 'coordination number':[mlAtom._atom.valence]}),
                                ignore_index=True)

        countplot(x='coordination number', hue='move type', data=coordNumsDataFrame)
        plt.savefig(str(Path.home() / 'Desktop' / 'testPlot.png'))
Example #18
 def get(self):
     NUMRESULTS = 30
     THRESHOLD = 80
     query = request.args.get('q')
     choices = lookup.loc[:, 'title'].tolist()
     res = process.extract(query,
                           choices,
                           limit=NUMRESULTS,
                           scorer=fuzz.partial_ratio)
     #collect required information about card and output
     out = DataFrame()
     for _, percent_match, id in res:
         if percent_match > THRESHOLD:
             card = lookup.query('id ==' + str(id)).copy()
             card.loc[:, 'percent_match'] = percent_match
             out = out.append(card)
     return Response(out.to_json(orient="records"),
                     mimetype='application/json')
Example #19
def average_csv_data(patients, filename, target, *data_path):
    data_path = data_path[0]
    df_list = []
    for p in data_path:
        df = DataFrame(columns=['clip',target])
        for patient in patients:
            d = read_csv(p + '/' + patient + target + '.csv')
            df = df.append(d)
        df_list.append(df)

    avg_df = DataFrame(columns=['clip', target])
    avg_df['clip'] = df_list[0]['clip']
    avg_df[target] = 0
    for df in df_list:
        avg_df[target] += df[target]

    avg_df[target] /= 1.0 * len(df_list)

    with open(filename+'.csv', 'wb') as f:
        avg_df.to_csv(f, header=True, index=False)
Example #20
def _get_hour_diffs(df_symbols):
    symbol = df_symbols[Column.SYMBOL]
    df_history = df_symbols[Column.HISTORY]

    df = update_dataframe(df_history, symbol)

    hours = df[Column.HOUR].unique()
    assert hours.shape[0] > 5, f"Wrong data for {symbol} {hours}"

    df_days = DataFrame(columns=Column.ALL)
    for year in df[Column.YEAR].unique():
        for week in df[Column.WEEK].unique():
            for day in df[Column.DAY].unique():
                df_day = df[
                    (df[Column.YEAR] == year)
                    & (df[Column.WEEK] == week)
                    & (df[Column.DAY] == day)
                ].copy()
                if df_day.empty:
                    continue
                first_hour = df_day[Column.HOUR].min()
                df_day[Column.PERCENT] = (
                    df_day[Column.OPEN]
                    / df_day[df_day[Column.HOUR] == first_hour].iloc[0][Column.OPEN]
                )
                if df_day.shape[0] >= 5:  # good data is at least 5 hours per day
                    df_days = df_days.append(df_day)
                else:
                    logger.debug(f"Not enough data for {symbol} in {week} {day}")

    return df_days[
        [
            Column.YEAR,
            Column.WEEK,
            Column.DAY,
            Column.HOUR,
            Column.SYMBOL,
            Column.PERCENT,
        ]
    ]
Example #21
def parse():

    df_list = DataFrame()
    for url in urls:
        response = requests.get(url, headers=headers).text
        # parse the JSON string
        json_str = json.loads(response)
        # top-level title
        title = json_str['info']['title']
        print(title)
        service_path = json_str['paths']
        svc_dict = list()
        for svc, data in service_path.items():
            req = data.get('post')
            req_method = 'post'
            if req == '' or req is None:
                req = data.get('get')
                req_method = 'get'
            if req == '' or req is None:
                req = data.get('put')
                req_method = 'put'
            if req == '' or req is None:
                req = data.get('delete')
                req_method = 'delete'
            if req is not None:
                body = (title, svc, req.get('summary'), req_method)
                svc_dict.append(body)

        if df_list.empty:
            df_list = DataFrame(svc_dict)
        else:
            df_list = df_list.append(DataFrame(svc_dict))

    df_list.columns = ['title', 'url', 'description', 'method']
    if os.path.exists(file_name):
        os.remove(file_name)
    df_list.to_csv('svc.csv', encoding='utf_8_sig')

    print("finished")
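The chain of if blocks above probes post, get, put and delete in turn and keeps the first one that is defined. The same lookup can be written as a short loop; a sketch that keeps the tuple layout used above:

def first_defined_method(title, svc, data):
    # Probe the HTTP methods in the same priority order as the code above and
    # return one (title, url, summary, method) tuple, or None if none is defined.
    for method in ("post", "get", "put", "delete"):
        req = data.get(method)
        if req not in (None, ""):
            return (title, svc, req.get("summary"), method)
    return None

# first_defined_method("API", "/user", {"get": {"summary": "List users"}})
# -> ("API", "/user", "List users", "get")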
Example #22
def MergeKpi(dir):
    filelist = list_all_files(dir)
    dataAll = DataFrame()
    for file in filelist:
        print('file = ' + file)
        name, ext = os.path.splitext(file)
        if ext != '.xlsx':
            print('ext = ' + ext)
            continue
        print(dir + '/' + file)
        data = pd.read_excel(dir + '/' + file)  # path of the Excel file
        # avoid adding the first file twice: either take it as the initial
        # frame or append it to what has been collected so far
        if dataAll.empty:
            dataAll = data
        else:
            dataAll = dataAll.append(data, ignore_index=True)
        print(dataAll)

    writer = pd.ExcelWriter(dir + "/all" + '.xlsx')
    dataAll.to_excel(writer, index=False)
    writer.save()
    writer.close()

    return
Example #23
def _get_monthly_diffs(df_symbols):
    symbol = df_symbols[Column.SYMBOL]
    df = update_dataframe(df_symbols[Column.HISTORY], symbol)

    df_months = DataFrame(columns=Column.ALL)
    for year in df[Column.YEAR].unique():
        df_month = df[df[Column.YEAR] == year].copy()
        if df_month.shape[0] < 12:
            logger.debug(f"Not enough data for {symbol} in {year}")
            continue

        first_month = df_month[Column.MONTH].min()
        df_month[Column.PERCENT] = (
            df_month[Column.OPEN]
            / df_month[df_month[Column.MONTH] == first_month].iloc[0][Column.OPEN]
        )
        assert (
            df_month.shape[0] == 12
        ), f"Wrong number of months in dataframe {df_month.shape} for year {year}"

        df_months = df_months.append(df_month)

    return df_months[[Column.YEAR, Column.MONTH, Column.SYMBOL, Column.PERCENT]]
Example #24
def _get_best_weekday_diffs(df_symbols):
    symbol = df_symbols[Column.SYMBOL]
    df = update_dataframe(df_symbols[Column.HISTORY], symbol)

    # if the number of working days is less than 3, don't count the week
    number_of_good_working = 3

    df_weeks = DataFrame(columns=Column.ALL)
    for year in df[Column.YEAR].unique():
        for week in df[Column.WEEK].unique():
            df_week = df[(df[Column.YEAR] == year) & (df[Column.WEEK] == week)].copy()
            if df_week.empty:
                continue

            days = df_week[Column.WEEKDAY].values
            if df_week.shape[0] < number_of_good_working:
                # first and last week of year might contain only 1-2 days
                if week not in (1, 52, 53):
                    logger.debug(
                        f"Not enough data for {symbol} in {year} week {week}: {days}"
                    )
                continue

            first_weekday = df_week[Column.WEEKDAY].min()
            df_week[Column.PERCENT] = (
                df_week[Column.OPEN]
                / df_week[df_week[Column.WEEKDAY] == first_weekday].iloc[0][Column.OPEN]
            )
            assert (
                df_week.shape[0] >= number_of_good_working
            ), f"Wrong number of weekdays in dataframe {df_week.shape} for year {year} {week}: {days}"

            df_weeks = df_weeks.append(df_week)

    return df_weeks[
        [Column.YEAR, Column.WEEK, Column.WEEKDAY, Column.SYMBOL, Column.PERCENT]
    ]
Example #25
def resample_laser_by(df: DataFrame, by: DataFrame, depth):
    '''
    From the given data frame compile statistics (mean, median, min, max, etc.)
    based on the parameters.

    :param df: larger dataframe with smaller intervals from which the statistics are compiled
    :param by: smaller dataframe with larger intervals used as the index of intervals
    :return: a DataFrame of resampled statistics for the specified sample and depth,
        indexed by the depth intervals from by

    Can only have one depth matching.
    '''

    dc = FrameClass(df)
    dc_by = FrameClass(by)
    if depth:
        header, = process_header_str(depth)
    else:
        header, = find_match(dc, dc_by)

    df = df.set_index(header.name)
    by = by.set_index(header.name)
    by = by[(by.index >= min(df.index)) & (by.index <= max(df.index))]

    new_df = DataFrame()
    if by.empty:
        return new_df
    for i in range(len(by.index.tolist()) - 1):

        idx = df[(df.index >= by.index[i]) & (df.index <= by.index[i + 1])]

        new_df = new_df.append(idx.apply(lambda x: numpy.nanmean(x)),
                               ignore_index=True)

    new_df = new_df.set_index(by.index[:-1])

    return new_df
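resample_laser_by averages df over each interval defined by consecutive index values of by. A sketch of the same binning with pandas.cut and a groupby mean, assuming a numeric, monotonically increasing depth index (bin-edge handling differs slightly from the inclusive-on-both-ends masks above):

import numpy as np
import pandas as pd

# Toy frames: 'df' sampled every 0.1 depth units, 'by' every 0.5 units.
df = pd.DataFrame({"value": np.arange(10, dtype=float)},
                  index=np.arange(0.0, 1.0, 0.1))
by = pd.DataFrame(index=[0.0, 0.5, 1.0])

# Assign each df row to the interval it falls into and average within each
# interval -- the statistic the loop above accumulates row by row.
bins = pd.cut(df.index, bins=by.index, include_lowest=True)
resampled = df.groupby(bins, observed=True).mean()
resampled.index = by.index[:-1]
print(resampled)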
Example #26
        def _parseDataFromFile(self, dataFile: str) -> DataFrame:
            #             _LOGGER.debug( "opening workbook: %s", dataFile )
            allDataFrames = pandas.read_html(dataFile,
                                             thousands='',
                                             decimal=',',
                                             encoding='utf-8')
            dataFrame = DataFrame()
            dataFrame = dataFrame.append(allDataFrames[1])  ## country
            dataFrame = dataFrame.append(allDataFrames[2])  ## foreign

            cleanup_column(dataFrame, 'Sektor')

            apply_on_column(dataFrame, 'Liczba wyemitowanych akcji',
                            convert_int)
            apply_on_column(dataFrame, 'Wartość rynkowa (mln zł)',
                            convert_float)
            apply_on_column(dataFrame, 'Wartość księgowa (mln zł)',
                            convert_float)

            apply_on_column(dataFrame, 'C/WK', convert_float)
            apply_on_column(dataFrame, 'C/Z', convert_float)
            apply_on_column(dataFrame, 'Stopa dywidendy (%)', convert_float)

            return dataFrame
Example #27
Data=DataFrame({})
for ith,document in enumerate(input_list):
    if ith%100==0:
        print('recording %ith, total %i'%(ith,total))

    spectr=ReadNMSSMToolsSpectr(document,ignore=ignore)
    # inNumber=re.findall(r'\d+',document)[-1]
    # outNumber+=1   # reNumber

    col_name=['No_','path']
    value_row=[ith,document]

    for block,code_value_dict in spectr.__dict__.items():
        # print(block_name)
        try:
            code_2_name=getattr(block_table,block)
        except AttributeError:
            continue
        else:
            for code,value in code_value_dict.items():
                try:
                    col_name.append(code_2_name(code))
                except KeyError:
                    raise# continue
                else:
                    value_row.append(value)
    Data=Data.append(
        DataFrame(numpy.array([value_row]),columns=col_name),
        ignore_index=True)

Data.to_csv('Data_%s.csv'%similarity)
Example #28
class PandasBackend(DataBackend):
    _data: DataFrame
    _index: PandasIndex
    _loc: _LocIndexer
    _iloc: _ILocIndexer

    def __init__(
        self,
        data: Optional[Union[Series, DataFrame, dict[str, list]]] = None,
        index: Optional[PandasIndex] = None,
    ) -> None:
        if data is None:
            self._data = DataFrame(dtype="object")
        elif type(data) is Series:
            self._data = cast(Series, data).to_frame().transpose()
        elif type(data) is DataFrame:
            self._data = DataFrame(data)
        elif type(data) is dict:
            sample_value = next(iter(data.values()))
            if not isinstance(sample_value, Iterable) or isinstance(
                    sample_value, str):
                self._data = Series(data).to_frame().transpose()
            else:
                self._data = DataFrame(data)
        else:
            raise ValueError(
                f"Received unexpected value type {type(data)}: {data}")

        if index is None:
            self._data.index.name = "index"
            self._index = PandasIndex(self._data.index, [])
        else:
            if not isinstance(index, PandasIndex):
                index = PandasIndex(index)
            self._data.index = index._data
            self._index = index
        self._loc = _LocIndexer(self)
        self._iloc = _ILocIndexer(self)

    def is_link(self) -> bool:
        return False

    def link_token(self) -> Optional[DataToken]:
        return None

    def to_pandas(self) -> DataFrame:
        return self._data

    @property
    def columns(self) -> list[str]:
        return self._data.columns.tolist()

    @property
    def values(self) -> np.ndarray:
        data_values = self._data.values
        shape = data_values.shape
        if shape[1] == 1:
            return np.squeeze(data_values, axis=1)
        elif shape[0] == 1:
            return np.squeeze(data_values, axis=0)
        else:
            return data_values

    @property
    def dtypes(self) -> dict[str, DataType]:
        return {
            col: DataType(dtype)
            for col, dtype in self._data.dtypes.items()
        }

    def cast_columns(self, column_dtypes: dict[str, type]) -> PandasBackend:
        return PandasBackend(self._data.astype(column_dtypes, errors="ignore"))

    def to_dict(self) -> dict[str, Any]:
        return self._data.to_dict("list")

    @property
    def index(self) -> Index:
        return self._index

    @property
    def index_name(self) -> Union[str, list[str]]:
        return self._data.index.name

    @property
    def loc(self: PandasBackend) -> LocIndexer[PandasBackend]:
        return self._loc

    @property
    def iloc(self: PandasBackend) -> ILocIndexer[PandasBackend]:
        return self._iloc

    def equals(self, other: PandasBackend) -> bool:
        if type(other) is not PandasBackend:
            return False
        return np.array_equal(self._data.values,
                              other._data.values) and self._index.equals(
                                  other._index)

    def __eq__(self, other) -> DataFrame:
        if issubclass(type(other), PandasBackend):
            other = other._data
        return self._data == other

    def __ne__(self, other: Any) -> DataFrame:
        if issubclass(type(other), PandasBackend):
            other = other._data
        return self._data != other

    def __gt__(self, other: Any) -> DataFrame:
        if issubclass(type(other), PandasBackend):
            other = other._data
        return self._data > other

    def __ge__(self, other: Any) -> DataFrame:
        if issubclass(type(other), PandasBackend):
            other = other._data
        return self._data >= other

    def __lt__(self, other: Any) -> DataFrame:
        if issubclass(type(other), PandasBackend):
            other = other._data
        return self._data < other

    def __le__(self, other: Any) -> DataFrame:
        if issubclass(type(other), PandasBackend):
            other = other._data
        return self._data <= other

    def __len__(self) -> int:
        return len(self._data)

    def __iter__(self) -> Generator[str, None, None]:
        return iter(self._data)

    def iterrows(self) -> Generator[tuple[int, PandasBackend], None, None]:
        for i, row in self._data.iterrows():
            yield (i, PandasBackend(row.to_frame().transpose()))

    def itertuples(self, ignore_index: bool = False):
        for values in self._data.itertuples(index=not ignore_index):
            yield values

    def __getitem__(self, item: str) -> Any:
        return PandasBackend(self._data[item].to_frame())

    def getitems(self, items: list[str]) -> PandasBackend:
        return PandasBackend(self._data[items])

    def getmask(self, mask: list[bool]) -> PandasBackend:
        return PandasBackend(self._data[mask])

    def query(self, query: "Query") -> PandasBackend:
        from tanuki.database.adapter.query.pandas_query_compiler import PandasQueryCompiler

        query_compiler = PandasQueryCompiler(self._data)
        query = query_compiler.compile(query)
        return PandasBackend(self._data[query])

    def __setitem__(self, items: str, value: Any) -> None:
        if isinstance(value, PandasBackend):
            value = value._data
        self._data[items] = value

    def get_index(self, index_alias: IndexAlias) -> Index:
        cols = [str(col) for col in index_alias.columns]
        new_data = self._data.set_index(cols)
        new_data.index.name = index_alias.name
        return PandasIndex(new_data.index, cols)

    def set_index(self, index: Union[Index, IndexAlias]) -> PandasBackend:
        cols = [str(col) for col in index.columns]
        new_data = self._data.set_index(cols)
        new_data.index.name = index.name
        new_index = PandasIndex(new_data.index, cols)
        return PandasBackend(new_data, new_index)

    def reset_index(self: PandasBackend) -> PandasBackend:
        new_data = self._data.reset_index(drop=True)
        new_data.index.name = "index"
        new_index = PandasIndex(new_data.index, [])
        return PandasBackend(new_data, new_index)

    def append(
        self: PandasBackend,
        new_backend: PandasBackend,
        ignore_index: bool = False,
    ) -> PandasBackend:
        return PandasBackend(
            self._data.append(new_backend._data, ignore_index=ignore_index))

    def drop_indices(self: PandasBackend, indices: list[int]) -> PandasBackend:
        return PandasBackend(self._data.drop(indices))

    @classmethod
    def concat(
        cls: type[PandasBackend],
        all_backends: list[PandasBackend],
        ignore_index: bool = False,
    ) -> PandasBackend:
        all_data = [backend._data for backend in all_backends]
        return PandasBackend(pd.concat(all_data, ignore_index=ignore_index))

    def nunique(self) -> int:
        return self._data.nunique()

    def __str__(self) -> str:
        return str(self._data)

    def __repr__(self) -> str:
        return str(self)
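A minimal usage sketch of the backend above, exercising the dict constructor and the concat classmethod. The import path is hypothetical, and PandasIndex, the loc/iloc indexer helpers and DataToken are assumed to be available from the same package:

# from tanuki.data_backend.pandas_backend import PandasBackend  # hypothetical path

left = PandasBackend({"symbol": ["AAA", "BBB"], "open": [1.0, 2.0]})
right = PandasBackend({"symbol": ["CCC"], "open": [3.0]})

combined = PandasBackend.concat([left, right], ignore_index=True)
print(combined.columns)     # ['symbol', 'open']
print(len(combined))        # 3
print(combined.to_pandas())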
Example #29
        now1 = now - one_day
        row = weatherDF[(weatherDF.Date == now1.strftime("%Y-%m-%d")) & (weatherDF.Station == weather_station)]
        weathers = weathers.append(row)
    return weathers

def weather_data(look_str, weatherDF):
    features = ["Tmax","Tmin","Tavg","DewPoint", "WetBulb", "Heat","Cool","SnowFall", "PrecipTotal", "ResultSpeed"]
    weather_week0 = lookup_last_week_weather(look_str, weatherDF)
    weather_week = weather_week0[features]
    averagesS = weather_week.mean(0)
    maxs = weather_week.max(0)
    maxsS = pd.Series()
    mins = weather_week.min(0)
    minsS = pd.Series()
    for f in features:
        maxsS["%s_max" % f] = maxs[f]
        minsS["%s_min" % f] = mins[f]
    #datapoints = pd.concat([averagesS, maxsS, minsS])
    datapoints = averagesS
    weather_data = DataFrame(datapoints).T
    weather_data["Date"] = look_str
    return weather_data
        
weather_avg = DataFrame()
dates = weather["Date"]
for d in dates:
    row = weather_data(d, weather)
    weather_avg= weather_avg.append(row, ignore_index=True)
weather_avg.to_csv(os.path.join(data_dir,'weather_info_averages5.csv'), index=False)

# duplicates()
Example #30
from makstat.zavod import iter_contextual_atom_data, get_metadata


stream = (line.decode('cp1251').strip().encode('utf-8')
          for line in stdin)

# tee the stream to get the metadata for title
stream, stream_2 = tee(stream)

title = get_metadata(stream_2)['TITLE']

df = DataFrame()
for cur_data in iter_contextual_atom_data(stream):
    current = DataFrame.from_dict([cur_data])
    df = df.append(current, ignore_index=False)

index_cols = list(df.columns.values)
index_cols.remove('value')
df.set_index(index_cols, inplace=True)
df.columns = [title]

# create removable temp file for use with HDFStore
tmpfile = NamedTemporaryFile().name

store = HDFStore(tmpfile)
store['default'] = df
store.close()

# put h5 file to stdout
with open(tmpfile, 'rb') as f:
Example #31
repos_with_kw_docker_2015_filepath = data_files_path + \
    'repos_with_docker_2015.csv'


df_github_repos_with_kw_docker_2011_to_2014 = DataFrame(pandas.read_csv(
    repos_with_kw_docker_2011_to_2014_filepath
)['repository_url'])


def apiurl_to_repourl(apiurl):
    return apiurl.replace('api.', '').replace('repos/', '')

df_repos_2015 = pandas.read_csv(repos_with_kw_docker_2015_filepath)['repo_url']
df_github_repos_with_kw_docker_2015 = DataFrame({
    'repository_url': map(apiurl_to_repourl, df_repos_2015)
})

df_repo_urls_with_kw_docker_2011_to_2015 = \
    df_github_repos_with_kw_docker_2011_to_2014.append(
        df_github_repos_with_kw_docker_2015,
        ignore_index=True
    )


def make_test_dataset(**kwargs):
    samplesize = kwargs['sample_size']
    testdf = df_repo_urls_with_kw_docker_2011_to_2015[:samplesize]
    # print testdf['repository_url'].drop_duplicates().values.tolist()
    return testdf['repository_url'].drop_duplicates().values.tolist()
Example #32
    def _h_index(self):

        K = self.model.K
        topic_counts = {}

        for i in range(K):

            sys.stdout.flush()

            # find the words in this topic above the threshold
            topic_words = self.topicdf.ix[:, i]
            topic_words = topic_words.iloc[topic_words.nonzero()[0]]

            fragment_words = {}
            loss_words = {}
            for word in topic_words.index:
                tokens = word.split('_')
                word_type = tokens[0]
                value = tokens[1]
                if word_type == 'fragment':
                    fragment_words[value] = 0
                elif word_type == 'loss':
                    loss_words[value] = 0

            # find the documents in this topic above the threshold
            topic_docs = self.docdf.ix[i, :]
            topic_docs = topic_docs.iloc[topic_docs.nonzero()[0]]

            # handle empty topics
            if topic_docs.empty:

                topic_counts[i] = 0

            else:

                # now find out how many of the documents in this topic actually 'cite' the words
                for docname in topic_docs.index:

                    # split mz_rt_peakid string into tokens
                    tokens = docname.split('_')
                    peakid = int(tokens[2])

                    # find all the fragment peaks of this parent peak
                    ms2_rows = self.ms2.loc[self.ms2['MSnParentPeakID']==peakid]
                    fragment_bin_ids = ms2_rows[['fragment_bin_id']]
                    loss_bin_ids = ms2_rows[['loss_bin_id']]

                    # convert from pandas dataframes to list
                    fragment_bin_ids = fragment_bin_ids.values.ravel().tolist()
                    loss_bin_ids = loss_bin_ids.values.ravel().tolist()

                    # this code is too slow!
                    # count the citation numbers
#                     for cited in fragment_bin_ids:
#                         if cited == 'nan':
#                             continue
#                         else:
#                             if cited in fragment_words:
#                                 fragment_words[cited] = fragment_words[cited] + 1
#                     for cited in loss_bin_ids:
#                         if cited == 'nan':
#                             continue
#                         else:
#                             if cited in loss_words:
#                                 loss_words[cited] = loss_words[cited] + 1

                    # convert to dictionary for quick lookup
                    word_dict = {}
                    for word in fragment_bin_ids:
                        word_dict.update({word:word})
                    for word in loss_bin_ids:
                        word_dict.update({word:word})

                    # count the citation numbers
                    for word in fragment_words:
                        if word in word_dict:
                            fragment_words[word] = fragment_words[word] + 1
                    for word in loss_words:
                        if word in word_dict:
                            loss_words[word] = loss_words[word] + 1

                    # make a dataframe of the articles & citation counts
                    fragment_df = DataFrame(fragment_words, index=['counts']).transpose()
                    loss_df = DataFrame(loss_words, index=['counts']).transpose()
                    df = fragment_df.append(loss_df)
                    df = df.sort(['counts'], ascending=False)

                    # compute the h-index
                    h_index = 0
                    for index, row in df.iterrows():
                        if row['counts'] > h_index:
                            h_index += 1
                        else:
                            break

                print " - Mass2Motif " + str(i) + " h-index = " + str(h_index)
                topic_counts[i] = h_index

        return topic_counts
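The inner loop above derives an h-index by walking the citation counts in descending order. The same number falls out of a one-line comparison of the sorted counts against their rank positions; a sketch on a toy counts series:

import numpy as np
import pandas as pd

# Toy citation counts per word (the 'counts' column built above).
counts = pd.Series({"w1": 5, "w2": 4, "w3": 2, "w4": 1})

# Sort descending and count how many positions still satisfy count >= rank;
# that count is the h-index (here: 2).
ranked = np.sort(counts.values)[::-1]
h_index = int((ranked >= np.arange(1, len(ranked) + 1)).sum())
print(h_index)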
Example #33
def annotateIGSeqRead(fastaFile, chain, db, noWorkers, seqsPerFile,
                      seqType='dna', outdir="", domainSystem='imgt', stream=None):
        if fastaFile is None:
            return Counter()

        # Estimate the IGV diversity in a library from igblast output 
        printto(stream, 'The IGV clones of ' + os.path.basename(fastaFile) + ' are being annotated ...')
        with open(fastaFile) as f:
            noSeqs = sum(1 for line in f if line.startswith(">"))
        totalFiles = int(ceil(noSeqs / seqsPerFile))
        if totalFiles < noWorkers:
            seqsPerFile = int(noSeqs / noWorkers) if noSeqs >= noWorkers else noSeqs
            totalFiles = int(ceil(noSeqs / seqsPerFile))
        noSplit = noSeqs <= seqsPerFile
        printto(stream, "\t{0:,} sequences were found to be distributed into {1:,} file(s)"
                .format(noSeqs, (totalFiles if not noSplit else 1)))

        # todo: commented out on Thu 21 Jun 2018 16:01:06 AEST by JIAHONG FONG
        # reason: unknown purpose - why are sequences being trimmed using the primer argument?
        # changelog: 1. newFastFile = fastaFile regardless, and remove the primer CMD argument entirely
        # if igRep.primer > 0:
        #     with safeOpen(fastaFile) as fp:
        #         recordsAll = SeqIO.to_dict(SeqIO.parse(fp, 'fasta'))
        #     records = []
        #     for id_ in recordsAll:
        #         rec = recordsAll[id_]
        #         rec.description = ''
        #         rec.seq = rec.seq[:igRep.primer]
        #         records.append(rec)
        #     filesDir = os.path.join(outdir, "tmp")
        #     newFastFile = os.path.join(filesDir, "seqs.fasta")
        #     SeqIO.write(records, newFastFile, 'fasta')
        # else:
        #     newFastFile = fastaFile

        newFastFile = fastaFile

        # if we only asked for one worker, or if the number of sequences in the fasta file is smaller
        # than the seqsPerFile threshold, we can just analyze the file without splitting it
        if noWorkers == 1 or noSplit:
            cloneAnnot, filteredIDs = analyzeSmallFile(newFastFile, chain, db,
                                                       seqType, noWorkers, outdir,
                                                       domainSystem=domainSystem, stream=stream)
        else:
            # split FASTA file into smaller files
            prefix, ext = os.path.splitext(os.path.basename(fastaFile))
            filesDir = os.path.join(outdir,  "tmp")
            prefix = prefix[prefix.find("_R")+1:prefix.find("_R")+3] + "_" if (prefix.find("_R") != -1) else ""
            splitFastaFile(fastaFile, totalFiles, seqsPerFile, filesDir, prefix, ext, stream=stream)

            # Prepare the multiprocessing queues
            tasks = Queue()    
            outcomes = Queue()   
            exitQueue = Queue()              
            cloneAnnot = DataFrame()
            filteredIDs = []
            workers = []
            try:
                # Initialize workers
                for _ in range(noWorkers):
                    w = IgBlastWorker(chain, db,
                                      seqType, int(ceil(noWorkers / totalFiles)),
                                      domainSystem=domainSystem, stream=stream)
                    w.tasksQueue = tasks
                    w.resultsQueue = outcomes
                    w.exitQueue = exitQueue      
                    workers.append(w)
                    w.start()       
                    sys.stdout.flush()

                # initialize tasks queue with file names     
                for i in range(totalFiles):
                    tasks.put(os.path.join(filesDir, prefix + "part" + str(i + 1) + ext))

                # Add a poison pill for each worker
                for _ in range(noWorkers + 10):
                    tasks.put(None)                  
               
                # Wait for all process workers to terminate
                i = 0 
                while i < noWorkers:    
                    m = exitQueue.get()
                    if m == "exit":
                        i += 1
                
                # Collect results
                printto(stream, "Results are being collated from all workers ...")
                sys.stdout.flush()
                while totalFiles:
                    outcome = outcomes.get()
                    totalFiles -= 1                    
                    if outcome is None:
                        continue                    
                    (cloneAnnoti, filteredIDsi) = outcome
                    cloneAnnot = cloneAnnot.append(cloneAnnoti)
                    filteredIDs += filteredIDsi
                    sys.stdout.flush()
                    gc.collect()
                printto(stream, "\tResults were collated successfully.")
                    
            except Exception:
                printto(stream, "Something went wrong during the annotation process!", LEVEL.EXCEPT)
                raise
            finally:
                for w in workers:
                    w.terminate()

            # Clean folders to save space
            # TODO: remove .fasta and .out files
            if noSeqs > seqsPerFile and os.path.exists(filesDir + os.path.sep + prefix + "part1" + ext):
                map(os.remove, glob.glob(filesDir + os.path.sep + "*" + ext))

        return cloneAnnot, filteredIDs
Example #34
            title='Please select your creative')
    root.destroy()
    # for root, dirs, files in os.walk(r''+cre_path+''):
    #     pass
    files = []
    for i in cre_path:
        files.append(i.split('/')[-1])

    #####################################################
    if cam_tye == '1':
        if cre_frmat == '1':
            # same+video
            # duplicate the rows (one copy of raw_data per creative file)
            new_data = DataFrame()
            for i in range(len(files)):
                new_data = new_data.append(raw_data)

            # assign Creative Type
            new_data['Creative Type'] = 'Video Page Post Ad'

            for file_index in range(len(files)):
                # creative name: each block of row_num rows gets one file
                new_data['Video File Name'][row_num * file_index:row_num *
                                            (file_index +
                                             1)] = files[file_index]

                # campaign name = old campaign name + creative name
                new_data['Campaign Name'][row_num * file_index:row_num * (
                    file_index + 1)] = new_data['Campaign Name'][
                        row_num * file_index:row_num *
                        (file_index + 1)] + '_' + files[file_index]
Example #35
def trial_2(login_info):  # Attempts to merge dataframes with two queries
    dsn_tns = cx_Oracle.makedsn('ora-tns-qcc1.in.qservco.com',
                                1521,
                                service_name='qcc1')
    connection = cx_Oracle.connect(user=login_info[0],
                                   password=login_info[1],
                                   dsn=dsn_tns)
    print('=' * 60)
    print('\t - Connection established.')
    query1 = '''
select distinct s.sub_id, s.sub_nm,
vtc.trans_dttm as latest_invoice_date,
vtp.trans_dttm as last_payment_date
from wasabi.v_srvc svc
join wasabi.v_sub s on s.sub_id = svc.sub_id
join wasabi.v_srvc_ct sct on sct.srvc_id = svc.srvc_id
join wasabi.v_ct_ct_num ccn on ccn.srvc_ct_id = sct.srvc_ct_id
join (
select * from wasabi.v_trans
where trans_cls = 'R'
and amt > 0
and trans_stat = 'I'
and trans_typ in (
select trans_typ from wasabi.v_trans_typ
where lower(trans_nm) like '%equip%'
)
) vtc on vtc.sub_id = s.sub_id
join (
select sub_id, max(trans_dttm) trans_dttm
from wasabi.v_trans
where trans_cls = 'P'
group by sub_id
) vtp on vtp.sub_id = s.sub_id
where svc.stat = 'A'
and (ccn.ct_num_typ = 21707 or ccn.ct_typ = 364)
and vtp.trans_dttm between trunc(sysdate)-30 and trunc(sysdate)
    '''
    query2 = '''
select 
sub_id, trans_dttm
from wasabi.v_trans
where trans_cls = 'P'
and sub_id = :sub_id
    '''
    df1 = pd.read_sql(query1, connection)
    print('\t - Read sql for df1.')
    df2 = DataFrame(columns=['SUB_ID', 'TRANS_DTTM'])
    for row in df1['SUB_ID']:
        # append returns a new frame, so reassign df2 (otherwise it stays empty)
        df2 = df2.append(pd.read_sql(query2, connection, params={'sub_id': row}),
                         ignore_index=True)
    # df2 = pd.read_sql(query2, connection, params = {'sub_id': df1['SUB_ID']}
    print('\t - Read sql for df2.')
    print(f'\t - DF1 Head:\n{df1.head}')
    print(f'\t - DF2 Head:\n{df2.head}')
    df = pd.merge(df1, df2)  #, on = "sub_id")
    print('\t - Merged Dataframes.')
    print(f'\t - DF Info:\n{df.info}')
    print(f'\t - DF Head:\n{df.head}')
    filename = 'Billed_Equipment_Paid_6mo.xlsx'
    sheetname = 'Subscribers & Equipment'
    writer = pd.ExcelWriter(filename, engine='xlsxwriter')
    df.to_excel(writer, sheet_name=sheetname, header=False, index=False)
    worksheet = writer.sheets[sheetname]
    (max_row, max_col) = df.shape
    headers = [{'header': header} for header in df.columns]
    worksheet.add_table(0, 0, max_row - 1, max_col - 1, {'columns': headers})
    writer.save()
    print('\t - Finished.')
Example #36
        ax[i][c].set_xlabel('Decision tree importances')
        ax[i][c].set_ylabel('Features')
        ax[i][c].set_title(adt_clr.columns[i] + " random_state: " +
                           str(random[c]))
f.tight_layout()
f.savefig('E:\\ML_py\\decision_tree_feature_importance.png',
          dpi=600,
          format='png')
f.clf()

# protein CD19
i = 8
rf = RandomForestRegressor(**param)
cor = []
for j in range(0, rna_log.shape[1]):
    cor.append(
        pearsonr(adt_clr[adt_clr.columns[i]], rna_log[rna_log.columns[j]])[0])

cor = DataFrame(cor)
cor.index = rna_log.columns
cor.columns = ['Pearson']
cor = cor.sort_values(axis=0, ascending=False, by='Pearson')
cor = cor[0:20]  #the top 20 genes
rna_change = rna_log[cor.index]
x_train, x_test, y_train, y_test = train_test_split(
    rna_change, adt_clr[adt_clr.columns[i]], test_size=0.3, random_state=0)
rf = rf.fit(x_train.values, y_train.values)

f, ax = plt.subplots(2, 2,
                     figsize=(70, 50))  #figsize (figsize[2]*35,figsize[1]*25)
importance = rf.feature_importances_
importance = DataFrame(importance)
Example #37
 def _h_index(self):
             
     topic_counts = {}
     n_topics = self.model.K
     for i in range(n_topics):
         
         sys.stdout.flush()
         
         # find the words in this topic above the threshold
         fragment_words = self._get_nonzero_words(i, self.topicdfs, 0)
         loss_words = self._get_nonzero_words(i, self.topicdfs, 1)
         
         # find the documents in this topic above the threshold
         topic_docs = self.docdf.ix[i, :]
         topic_docs = topic_docs.iloc[topic_docs.nonzero()[0]]
         
         # handle empty topics
         if topic_docs.empty:
             
             topic_counts[i] = 0
             
         else:
         
             # now find out how many of the documents in this topic actually 'cite' the words    
             for docname in topic_docs.index:
 
                 # split mz_rt_peakid string into tokens
                 tokens = docname.split('_')
                 peakid = int(tokens[2])
                 
                 # find all the fragment peaks of this parent peak
                 ms2_rows = self.ms2.loc[self.ms2['MSnParentPeakID']==peakid]
                 fragment_bin_ids = ms2_rows[['fragment_bin_id']]
                 loss_bin_ids = ms2_rows[['loss_bin_id']]       
                 
                 # convert from pandas dataframes to list
                 fragment_bin_ids = fragment_bin_ids.values.ravel().tolist()
                 loss_bin_ids = loss_bin_ids.values.ravel().tolist()
                 
                 # count the citation numbers
                 for cited in fragment_bin_ids:
                     if cited == 'nan':
                         continue
                     else:
                         if cited in fragment_words:
                             fragment_words[cited] = fragment_words[cited] + 1
                 for cited in loss_bin_ids:
                     if cited == 'nan':
                         continue
                     else:
                         if cited in loss_words:
                             loss_words[cited] = loss_words[cited] + 1
                 
                 # make a dataframe of the articles & citation counts
                 fragment_df = DataFrame(fragment_words, index=['counts']).transpose()
                 loss_df = DataFrame(loss_words, index=['counts']).transpose()
                 df = fragment_df.append(loss_df)
                 df = df.sort(['counts'], ascending=False)
                 
                 # compute the h-index
                 h_index = 0
                 for index, row in df.iterrows():
                     if row['counts'] > h_index:
                         h_index += 1
                     else:
                         break
 
             topic_counts[i] = h_index
         
     return topic_counts
Example #38
    df = pd.read_excel('data/anno_xls/19monthold36monthold.xlsx',
                       sheet_name=str(sheetid) + 'MonthOld')
    df.to_excel(writer, sheet_name=str(sheetid) + 'MonthOld', index=False)
    # print(df)
writer.save()

# Merge into one sheet
writer = pd.ExcelWriter('data/anno_xls/onesheet.xlsx', engine='xlsxwriter')

source = []
df = DataFrame()
for sheetid in range(0, 18 + 1):
    df_read = pd.read_excel('data/anno_xls/0monthold18monthold.xlsx',
                            sheet_name=str(sheetid) + 'MonthOld')
    # print(df_read)
    df = df.append(df_read, ignore_index=True)
    # print(df)
    for _ in range(len(df_read)):
        source.append(str(sheetid) + 'MonthOld.txt')
for sheetid in range(19, 36 + 1):
    df_read = pd.read_excel('data/anno_xls/19monthold36monthold.xlsx',
                            sheet_name=str(sheetid) + 'MonthOld')
    df_read = df_read.iloc[:, 0:2]
    # print(df_read)
    df = df.append(df_read, ignore_index=True)
    for _ in range(len(df_read)):
        source.append(str(sheetid) + 'MonthOld.txt')
# print(source)
df['Source'] = source
print(df.head())
print(df.tail())
Example #39
def main(mytimer: func.TimerRequest, outputBlob: func.Out[str]) -> None:
    logger = logging.getLogger("logger_name")
    logger.disabled = True

    blob_service_actuals = ContainerClient.from_connection_string(
        os.environ['Blockblob'], "actuals", logger=logger)
    blob_service_pickles = ContainerClient.from_connection_string(
        os.environ['Blockblob'], "treepickles", logger=logger)

    f = blob_service_actuals.download_blob("weather_predictions.json")
    s = blob_service_actuals.download_blob("snow_depths.json")
    o = blob_service_actuals.download_blob("weather_observations.json")

    df = pd.read_json(f.content_as_text())
    dfSnow = pd.read_json(s.content_as_text())
    dfObs = pd.read_json(o.content_as_text())

    dfOut = DataFrame(columns=[
        'station_id', 'station_name', 'weather_forecast_ref_time', 'obs',
        'day', 'interval', 'prediction_sort_counter', 'prediction_time_from',
        'prediction_time_to', 'closed_prediction'
    ])
    for station_id in df["station_id"].unique():

        snow_depth = dfSnow[dfSnow['Station_id'] ==
                            station_id].reset_index().get('Snødybde').fillna(0)
        air_temp = dfObs[dfObs['Station_id'] == station_id].reset_index().get(
            'air_temp').fillna(0)
        relative_humidity = dfObs[dfObs['Station_id'] ==
                                  station_id].reset_index().get(
                                      'relative_humidity').fillna(0)
        dew_point_temp = dfObs[dfObs['Station_id'] == station_id].reset_index(
        ).get('dew_point_temp').fillna(0)
        wind_speed = dfObs[dfObs['Station_id'] == station_id].reset_index(
        ).get('wind_speed').fillna(0)
        wind_bearing = dfObs[dfObs['Station_id'] == station_id].reset_index(
        ).get('wind_bearing').fillna(0)
        min_visibility_dist = dfObs[dfObs['Station_id'] ==
                                    station_id].reset_index().get(
                                        'min_visibility_dist').fillna(0)
        precipitation_intensity = dfObs[
            dfObs['Station_id'] == station_id].reset_index().get(
                'precipitation_intensity').fillna(0)
        road_temp = dfObs[dfObs['Station_id'] == station_id].reset_index().get(
            'road_temp').fillna(0)

        for obs in range(1, 5, 1):  # 4 observations per day
            for day in df["prediction_day"].unique(
            ):  # loop through the predicted days [0,1,2]
                if day < 3:  # drop the forecasts 3 days ahead.
                    for interval in df["prediction_interval"].unique(
                    ):  # loop through the four time intervals per day
                        dfInput = DataFrame()
                        for h in range(6, 55, 6):
                            r = df[(df['station_id'] == station_id)
                                   & (df['prediction_hour'] == h)].copy(
                                       deep=True).reset_index()
                            dfInput['h' + str(h) + '_air_temp'] = r['air_temp']
                            dfInput['h' + str(h) +
                                    '_precipitation_amount'] = r[
                                        'precipitation_amount'].fillna(0)
                            dfInput['h' + str(h) +
                                    '_wind_bearing_sin'] = np.sin(
                                        2 * np.pi * r['wind_bearing'] / 360.0)
                            dfInput['h' + str(h) +
                                    '_wind_bearing_cos'] = np.cos(
                                        2 * np.pi * r['wind_bearing'] / 360.0)
                            dfInput['h' + str(h) +
                                    '_wind_speed'] = r['wind_speed']
                            dfInput['h' + str(h) + '_cloud_area_fraction'] = r[
                                'cloud_area_fraction'] / 100
                            dfInput['h' + str(h) +
                                    '_air_pressure_at_sea_level'] = r[
                                        'air_pressure_at_sea_level'] * 100
                            dfInput['h' + str(h) + '_relative_humidity'] = r[
                                'relative_humidity'] / 100

                        dfInput['air_temp'] = air_temp
                        dfInput['relative_humidity'] = relative_humidity
                        dfInput['dew_point_temp'] = dew_point_temp
                        dfInput['wind_speed'] = wind_speed
                        dfInput['min_visibility_dist'] = min_visibility_dist
                        dfInput[
                            'precipitation_intensity'] = precipitation_intensity
                        dfInput['road_temp'] = road_temp
                        dfInput['snow_depth'] = snow_depth
                        dfInput['wind_bearing_sin'] = np.sin(
                            2 * np.pi * wind_bearing / 360.0)
                        dfInput['wind_bearing_cos'] = np.cos(
                            2 * np.pi * wind_bearing / 360.0)
                        dfInput['day_this_winter'] = datetime.today(
                        ).timetuple().tm_yday - 274 if 0 < datetime.today(
                        ).timetuple().tm_yday - 274 < 365 else datetime.today(
                        ).timetuple().tm_yday + 91

                        pic = blob_service_pickles.download_blob(
                            build_file_name(station_id, obs, day,
                                            interval)).readall()
                        model = RandomForestClassifier()
                        model = pickle.loads(pic)
                        ynew = model.predict_proba(dfInput.values)
                        obs_now = math.floor(datetime.now().hour / 6) + 1
                        prediction_time_from = datetime.now().replace(
                            hour=0, minute=0, second=0,
                            microsecond=0) + timedelta(hours=int((day * 24) + (
                                (interval - 1) * 6)))
                        prediction_time_to = prediction_time_from + timedelta(
                            hours=6)
                        if (obs == obs_now) and (prediction_time_from >
                                                 datetime.now()):
                            dfOut = dfOut.append(
                                {
                                    'station_id':
                                    station_id,
                                    'station_name':
                                    road_stations.get(station_id),
                                    'weather_forecast_ref_time':
                                    datetime_from_utc_to_local(
                                        parse(r['forecast_ref_time_zulu'][0])).
                                    strftime("%d.%m.%Y %H:%M:%S"),
                                    'obs':
                                    obs,
                                    'day':
                                    day,
                                    'interval':
                                    interval,
                                    'prediction_sort_counter':
                                    prediction_time_from,
                                    'prediction_time_from':
                                    prediction_time_from.strftime(
                                        "%d.%m.%Y %H:%M:%S"),
                                    'prediction_time_to':
                                    prediction_time_to.strftime(
                                        "%d.%m.%Y %H:%M:%S"),
                                    'closed_prediction':
                                    ynew[0][1]
                                },
                                ignore_index=True)

    dfOut = dfOut.sort_values(by=['station_id', 'obs', 'day', 'interval'])
    outputBlob.set(
        dfOut.to_json(orient='records',
                      force_ascii=False,
                      indent=2,
                      index=True))