Example #1
def createNewCSV(fileOutput, timeInterval: int):
    heads = []
    heads.append('Date')
    heads.append('Attr')

    counter = (24 * 60) // timeInterval  # number of time slots per day
    # Minute style
    if timeInterval < 60:
        for i in range(counter):
            tmpM = (i * timeInterval) % 60
            tmpH = (i * timeInterval) // 60
            # zero-pad the minutes so that e.g. 65 min renders as "1:05", not "1:5"
            heads.append(f"{tmpH}:{tmpM:02d}")
    # Hour style
    else:
        for i in range(counter):
            heads.append(f"{i}:00")

    heads.append('label')
    vals = commonUtil.d(heads)

    fileUtil.saveDataToCSV(fileOutput, vals)
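
All of these examples lean on two project helpers, commonUtil.d and fileUtil.saveDataToCSV, whose source is not shown. Judging purely from the call sites, a minimal sketch of their behavior might look like the following (both bodies are guesses, not the project's actual code):

import csv

def d(*keyLists):
    # Assumed behavior: merge one or more lists of column names into a
    # dict mapping each name to an empty list (insertion order kept).
    out = {}
    for keys in keyLists:
        for key in keys:
            out[key] = []
    return out

def saveDataToCSV(filepath, vals):
    # Assumed behavior: write the dict column-wise; a dict of still-empty
    # lists therefore produces just the header row.
    with open(filepath, 'w', newline='') as fh:
        writer = csv.writer(fh)
        writer.writerow(vals.keys())
        writer.writerows(zip(*vals.values()))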

Example #2

def rwCSV(filesAggregated: list,
          fileOutput: str,
          needColumns: list,
          Default: list,
          pivot='datetime') -> list:

    files = []
    raw_output = fileOutput
    fileOutput = commonUtil.getPath(fileOutput)
    for f in filesAggregated:

        raw_name = f
        fileInput = commonUtil.getPath(f)

        data = fileUtil.readFromFileToData(fileInput=fileInput)
        # the Default columns always come last
        vals = commonUtil.d(needColumns, Default)
        for attr in needColumns + Default:
            vals[attr] = data[attr]

        # second path component (assumes inputs like 'dir/name.csv')
        tmpName = raw_name.split(os.sep)[1]

        # note: column order may not be preserved here
        fileUtil.saveDataToCSV(fileOutput + f"_{tmpName}", vals)
        files.append(raw_output + f"_{tmpName}")

    return files
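
fileUtil.readFromFileToData is likewise project code; from its use in these examples it looks like a thin CSV-to-DataFrame wrapper, roughly as sketched below (an assumption, not the real implementation):

import pandas as pd

def readFromFileToData(fileInput):
    # Assumed behavior: parse the CSV, header row included, and return
    # a DataFrame whose columns are addressable by name.
    return pd.read_csv(fileInput)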

Example #3

def prediction_flow(csv_filepath, prediction_filepath, model_filepath, labels,
                    YESTERDAY):
    """
    Prediction flows reads the file, removes columns not allowed in ml model,
    makes predictions, appends predictions to original dataframe, filters out necessary columns

    args:
        csv_filepath str Filepath of the aggregated csv
        model_filepath str Filepath of the joblib file
        labels list List of strings for the label, order matters
        remove_columns list List of strings for columns that need to be removed before placing in model
        retain_columns list List of strings for columns that need to be kept overall before storing in DB

    """
    # create list for predictions
    predictions = []

    # Load our pre-trained model (load here is joblib.load)
    model_filepath = commonUtil.getPath(model_filepath)
    clf = load(model_filepath)

    # read the aggregated data (header included); only the time columns are retained
    csv_filepath = commonUtil.getPath(csv_filepath)
    df_model_input = fileUtil.retainTimeColumnsInCSV(csv_filepath)

    # make a list of prediction labels
    for row in df_model_input.values:
        prediction = make_predictions(clf, row, labels)
        predictions.append(prediction)

    prediction_filepath = commonUtil.getPath(prediction_filepath)
    vals = commonUtil.d(['Date', 'Label', 'Frequency'])
    fileUtil.saveDataToCSV(prediction_filepath, vals)

    res = []
    for label in labels:
        tmp = [YESTERDAY, label]
        tmp.append(predictions.count(label))
        res.append(tmp)
    fileUtil.addLinesToCSV(prediction_filepath, res)
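
make_predictions is not shown in the source. A minimal sketch of what it plausibly does, assuming a scikit-learn-style classifier whose integer class output indexes into labels (both assumptions):

def make_predictions(clf, row, labels):
    # Assumed behavior: predict a single sample and map the class index
    # back to its human-readable label string.
    class_idx = clf.predict([row])[0]
    return labels[int(class_idx)]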
Example #4
def rwCSV(filesAggregated: list,
          fileOutput: str,
          UserInputColumns: list,
          APPENDIX=APPENDIX,
          normalizeMethod=zscore) -> list:
    files = []
    raw_output = fileOutput
    fileOutput = commonUtil.getPath(fileOutput)
    for f in filesAggregated:

        raw_name = f
        fileInput = commonUtil.getPath(f)

        data = fileUtil.readFromFileToData(fileInput)

        vals = commonUtil.d(UserInputColumns)

        for attr in UserInputColumns:
            vals[attr] = normalizeMethod(data, attr)
        # the last columns are always APPENDIX, usually 'Minute', 'Hour', 'Day', 'Month', 'Year'
        for attr in APPENDIX:
            vals[attr] = data[attr]

        # zscore of a constant column is all-NaN; replace such columns with zeros
        newVals = {}
        for key in vals:
            if not np.isnan(vals[key][0]):
                newVals[key] = vals[key]
            else:
                newVals[key] = [0] * len(vals[key])

        tmpName = raw_name.split(os.sep)[1]
        fileUtil.saveDataToCSV(fileOutput + f"_{tmpName}", newVals)
        files.append(raw_output + f"_{tmpName}")

    return files
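
zscore, the default normalizeMethod, is also not shown; given its (data, attr) call signature it presumably standardizes one DataFrame column, roughly like this sketch (an assumption):

import numpy as np

def zscore(data, attr):
    # Assumed behavior: scale one column to zero mean and unit variance.
    # A constant column has std == 0 and comes back all-NaN, which the
    # caller above then replaces with zeros.
    col = np.asarray(data[attr], dtype=float)
    return (col - col.mean()) / col.std()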
Example #5
def createNewCSV(self):
    self.initConnection()
    vals = commonUtil.d(self.head)
    fileUtil.saveDataToCSV(self.fileOutput, vals)
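
This snippet is a method, so it needs an enclosing class. The sketch below is a hypothetical minimal host; the class name and the initConnection/head/fileOutput members are guesses taken from the attributes the method touches:

class CSVExporter:
    # Hypothetical host class; only the members the snippet uses are sketched.
    def __init__(self, head, fileOutput):
        self.head = head              # list of column names
        self.fileOutput = fileOutput  # path of the CSV to create

    def initConnection(self):
        # Assumed to open whatever DB or file connection the exporter needs.
        pass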
Example #6
def rwCSV(fileInput: str, fileOutput: str,
          timeIntervalList: list,
          timeFormat=timeFormat, pivot='datetime', appendColumns=appendColumns) -> list:
    if len(timeIntervalList) == 0:
        timeIntervalList = ['1hour']

    raw_name = fileOutput

    path = commonUtil.getPath(fileInput)
    fileOutput = commonUtil.getPath(fileOutput)
    data = fileUtil.readFromFileToData(path)

    data_len = len(data)

    # column order here is simply whatever the source file provides
    allColumns = data.columns.tolist()
    time_col = data[pivot]

    len_timeIntervals_sorted, timeIntervals_sorted = commonUtil.getIntervalList(timeIntervalList)
    # keep every column except the pivot and anything that looks like an id
    selectColumns = []
    for col in allColumns:
        if col != pivot and not re.search('id', col, flags=re.IGNORECASE):
            selectColumns.append(col)

    # one dict of output columns per time interval, keyed by interval index
    vals = {i: commonUtil.d(selectColumns, appendColumns)
            for i in range(len_timeIntervals_sorted)}

    # A nested function is not recommended style, but it keeps the hot loop fast
    def timeAggregate(startHourTimeStamp, startLine, endLine, interval: int):

        localTime = time.localtime(startHourTimeStamp[interval])

        timeSpan = endLine - startLine + 1
        alertLevel = int(timeSpan * 0.75)

        print("aggregating")

        # (1) average each column over the window
        # (2) if roughly 75% or more of the window is zero, record 0 instead
        for attr in selectColumns:
            # scan from startLine to endLine, inclusive
            vList = data[attr][startLine:endLine + 1]
            counter = vList.tolist().count(0)

            if counter < alertLevel:
                value = vList.sum() / timeSpan
                vals[interval][attr].append(value)
            else:
                vals[interval][attr].append(0)

        # HARDCODE
        # TODO fine for now; may need revisiting in a future version
        vals[interval]['Minute'].append(localTime.tm_min)
        vals[interval]['Hour'].append(localTime.tm_hour)
        vals[interval]['Day'].append(localTime.tm_mday)
        vals[interval]['Month'].append(localTime.tm_mon)
        vals[interval]['Year'].append(localTime.tm_year)

    startLine = [0] * len_timeIntervals_sorted
    lastStartLine = [0] * len_timeIntervals_sorted
    lastEndLine = [0] * len_timeIntervals_sorted

    data_raw_time = time_col[0]
    lastHourTimeStamp = [time.mktime(time.strptime(data_raw_time, timeFormat)) for i in range(len_timeIntervals_sorted)]
    currentHour = 0
    for lineIdx in range(data_len):

        data_raw_time = time_col[lineIdx]
        currentHour = time.mktime(time.strptime(data_raw_time, timeFormat))

        # For each interval layer: once a full window has elapsed,
        # aggregate the finished window and start a new one
        for idx, timeInterval in enumerate(timeIntervals_sorted):
            if currentHour - lastHourTimeStamp[idx] >= timeInterval * 60:
                startLine[idx] = lineIdx
                lastEndLine[idx] = lineIdx - 1
                timeAggregate(lastHourTimeStamp, lastStartLine[idx], lastEndLine[idx], idx)
                lastHourTimeStamp[idx] = currentHour

            lastStartLine[idx] = startLine[idx]

    # flush the final, still-open window for every interval
    for idx in range(len_timeIntervals_sorted):
        timeAggregate(lastHourTimeStamp, lastStartLine[idx], data_len - 1, idx)

    files = []
    for idx, timeInterval in enumerate(timeIntervals_sorted):
        fileUtil.saveDataToCSV(fileOutput + f"_{timeInterval}.csv", vals[idx])

        files.append(raw_name + f"_{timeInterval}.csv")

    return files
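
commonUtil.getIntervalList is project code too; since its results are later compared as timeInterval * 60 seconds and used in file names, it presumably parses interval names into minute counts and returns them sorted along with their count. A sketch under that assumption:

import re

def getIntervalList(timeIntervalList):
    # Assumed behavior: turn names such as '15min' or '1hour' into minute
    # counts, sorted ascending, and return (count, sorted_minutes).
    unit_minutes = {'min': 1, 'hour': 60, 'day': 24 * 60}
    minutes = []
    for name in timeIntervalList:
        m = re.match(r'(\d+)\s*([a-z]+)', name.lower())
        minutes.append(int(m.group(1)) * unit_minutes[m.group(2)])
    minutes.sort()
    return len(minutes), minutes

Under these assumptions, rwCSV('in.csv', 'out', ['15min', '1hour']) would write out_15.csv and out_60.csv and return the corresponding raw (un-resolved) paths.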