Exemplo n.º 1
0
from tableausdk import Extract as tde  # **Module change from py2 for py3
from tableausdk import Types as tdeTypes  # **
import os

#Step 1: Create the Extract File
if os.path.isfile('trivialExample.tde'):
    os.remove('trivialExample.tde')

tdefile = tde.Extract('trivialExample.tde')

#Step 2: Create the tableDef
tableDef = tde.TableDefinition()
tableDef.addColumn('rowID', tdeTypes.Type.CHAR_STRING)  # Column index 0. **
tableDef.addColumn('value', tdeTypes.Type.DOUBLE)  #Column index 1. **

#Step 3: Create the table in the image of the tableDef
table = tdefile.addTable('Extract', tableDef)

#Step 4: Create some rows and insert them into the table
newrow = tde.Row(tableDef)
for i in range(1, 101):
    newrow.setCharString(0, 'Row ' + str(i))
    newrow.setDouble(1, i)
    table.insert(newrow)

#Step 5: Close the tde
tdefile.close()
Exemplo n.º 2
0
    def build(self):
        # Set filename
        tdefileName = self.tde_settings_ins.tde_file
        if os.path.isfile(tdefileName):
            os.remove(tdefileName)
        #set the extract object instance
        tdefile = tdeEx.Extract(tdefileName)
        # create the column schema
        tableDef = tdeEx.TableDefinition()
        columnsListedInOrder = []
        outputColIndex = {}
        outputColNumber = 0
        for columnName, index in sorted(
                self.tde_settings_ins.columnIndex.items(), key=lambda x: x[1]):
            BasicDataType = self.tde_settings_ins.columns[columnName]
            tableDef.addColumn(columnName,
                               tdeSettings.schemaIniTypeMap[BasicDataType])
            outputColIndex[columnName] = outputColNumber
            outputColNumber += 1
        #add the column schema to the ojbect instance of EXTRACT
        table = tdefile.addTable("Extract", tableDef)
        rowNo = 0

        rowsIterator = self.tde_settings_ins.inputInfo.fileInformation.yieldRowsBase(
        )
        printedBool = False
        startTime = datetime.datetime.now()
        print "Build start time:", datetime.datetime.strftime(
            startTime, '%Y-%m-%d %H:%M:%S')
        prevPct = -1
        for myReaderRow in rowsIterator:
            putRow = tdeEx.Row(tableDef)
            for columnName, index in self.tde_settings_ins.columnIndex.items():
                #if not column headers need to change this to index of the column
                if (tdeSettings.schemaIniTypeMap[
                        self.tde_settings_ins.columns[columnName]] !=
                        tde.Type.UNICODE_STRING and myReaderRow[index] == ""):
                    putRow.setNull(outputColIndex[columnName])
                else:
                    try:
                        #valueDecoded=myReaderRow[index].decode(self.tde_settings_ins.inputInfo.fileInformation.encodeing)
                        self.fieldSetterMap[tdeSettings.schemaIniTypeMap[
                            self.tde_settings_ins.columns[columnName]]](
                                putRow, outputColIndex[columnName],
                                myReaderRow[index])
                    except:
                        print "column name", columnName, "contains invalid data"
                        print "value: ", myReaderRow[index]
                        print sys.exc_info()[0]
                        raise
            table.insert(putRow)
            # Output progress line
            #only print percent one time when we hit that percent
            rowNo += 1
            pct = math.floor(
                self.tde_settings_ins.inputInfo.fileInformation.getPercent(
                    rowNo))
            #            print "row:",rowNo
            #            print "pct:",pct
            #            print "value: ",pct %2
            if pct % 2 == 0:
                if printedBool == False or prevPct != pct:
                    print "Percent:", round(pct, 0), "%, row:", rowNo
                    printedBool = True
                    prevPct = pct
            else:
                printedBool = False
        print "tde file built to:", tdefileName
        endTime = datetime.datetime.now()
        print "End Time:", endTime
        diff = endTime - startTime
        print "Build elapsed time:", diff
Exemplo n.º 3
0
def create(target_filepath, dataframe):
    column = []
    ## 宣告Dataframe(左)pandas資料格式 對應 到TDE的資料格式(右)。
    fieldMap = {
        'float64': tde.Types.Type.DOUBLE,
        'float32': tde.Types.Type.DOUBLE,
        'int64': tde.Types.Type.INTEGER,
        'int8': tde.Types.Type.INTEGER,
        'object': tde.Types.Type.UNICODE_STRING,
        'bool': tde.Types.Type.DOUBLE,
        'datetime64[ns]': tde.Types.Type.DATETIME
    }

    ## 若TDE已存在,則刪除。
    if os.path.isfile(target_filepath):
        os.remove(target_filepath)

    tdefile = tde.Extract(target_filepath)
    schema = tde.TableDefinition()  # define the table definition

    colnames = dataframe.columns  # dataframe all column name
    coltypes = dataframe.dtypes  # dataframe all column's datatype

    ## 根據Dataframe建立Tde Schema
    for i in range(0, len(colnames)):
        cname = colnames[i]
        ctype = fieldMap.get(str(coltypes[i]))
        schema.addColumn(cname, ctype)  # 加入一行欄位(名稱,資料格式)
        column.append(cname)

    ## 開啟TDE FILE寫入資料
    with tdefile as extract:
        table = extract.addTable("Extract", schema)
        for r in range(0, dataframe.shape[0]):
            row = tde.Row(schema)
            ## Loop 顯示剩餘筆數
            count = dataframe.shape[0] - r - 1
            print('剩' + str(count) + '筆', end="\r")
            ## 每筆資料依照格式寫入TDE內
            for c in range(0, len(coltypes)):
                if str(coltypes[c]) == 'float64':
                    if dataframe.iloc[r, c] is None:
                        row.setNull(c)
                    else:
                        row.setDouble(c, dataframe.iloc[r, c])
                elif str(coltypes[c]) == 'float32':
                    row.setDouble(c, dataframe.iloc[r, c])
                elif str(coltypes[c]) == 'int64':
                    if dataframe.iloc[r, c] is None:
                        row.setNull(c)
                    else:
                        row.setInteger(c, dataframe.iloc[r, c])
                elif str(coltypes[c]) == 'int8':
                    row.setInteger(c, dataframe.iloc[r, c])
                elif str(coltypes[c]) == 'object':
                    if dataframe.iloc[r, c] is None:
                        row.setNull(c)
                    else:
                        row.setString(c, dataframe.iloc[r, c])
                elif str(coltypes[c]) == 'bool':
                    row.setBoolean(c, dataframe.iloc[r, c])
                elif str(coltypes[c]) == 'datetime64[ns]':
                    try:
                        dt = dataframe.iloc[r, c]
                        row.setDateTime(c, dt.year, dt.month, dt.day, dt.hour,
                                        dt.minute, dt.second, 0)
                    except:
                        row.setNull(c)
                else:
                    row.setNull(c)
            ## insert the row
            table.insert(row)
    ## close the tdefile
    tdefile.close()
Exemplo n.º 4
0
def extract(file_name):
    # move file to /extract
    # cd to /extract
    # if there is no extract called TRACK_TERM then create one, otherwise append to TRACK_TERM
    # define data model for extract

    global WORKING_DIRECTORY, TRACK_TERM

    if not os.path.exists(WORKING_DIRECTORY + '/extract/'):
        os.makedirs(WORKING_DIRECTORY + '/extract/')

    from_path = WORKING_DIRECTORY + '/' + file_name
    to_path = WORKING_DIRECTORY + '/extract/' + file_name

    os.rename(from_path, to_path)

    os.chdir(WORKING_DIRECTORY + '/extract')

    # define the extract
    with data.Extract(TRACK_TERM + '.tde') as extract:

        tableDef = data.TableDefinition()

        # define the columns and the data types in the extract
        tableDef.addColumn('lang', types.Type.CHAR_STRING)  #0
        tableDef.addColumn('sentiment', types.Type.DOUBLE)  #1
        tableDef.addColumn('country', types.Type.CHAR_STRING)  #2
        tableDef.addColumn('created_at', types.Type.DATETIME)  #3
        tableDef.addColumn('tweet_text', types.Type.CHAR_STRING)  #4
        tableDef.addColumn('Longitude', types.Type.DOUBLE)  #5
        tableDef.addColumn('source', types.Type.CHAR_STRING)  #6
        tableDef.addColumn('user', types.Type.CHAR_STRING)  #7
        tableDef.addColumn('Latitude', types.Type.DOUBLE)  #8

        table = None

        if not extract.hasTable('Extract'):
            # Table does not exist, so create it.
            print "Creating a new extract"
            table = extract.addTable('Extract', tableDef)
        else:
            # Table exists, so append the new data.
            print "Appending to an existing extract"
            table = extract.openTable('Extract')

        new_row = data.Row(tableDef)

        # read the data from the CSV into the extract row object
        with open(file_name, 'r') as inf:
            reader = csv.DictReader(inf, delimiter=',', lineterminator='\n')
            for row in reader:
                # insert data into the row object in the correct order as defined above
                new_row.setCharString(0, row['lang'])

                sentiment = float(row['sentiment'])
                new_row.setDouble(1, sentiment)

                new_row.setCharString(2, row['country'])

                # parse the twitter date string:
                # Mon Sep 21 11:03:53 +0000 2015
                # %a %b %d %H:%M:%S +0000 %Y
                date_object = datetime.strptime(row['created_at'],
                                                '%a %b %d %H:%M:%S +0000 %Y')
                year = int(datetime.strftime(date_object, '%Y'))
                month = int(datetime.strftime(date_object, '%m'))
                day = int(datetime.strftime(date_object, '%d'))
                hour = int(datetime.strftime(date_object, '%H'))
                min = int(datetime.strftime(date_object, '%M'))
                sec = int(datetime.strftime(date_object, '%S'))
                frac = 0  # fractions of a second aka milliseconds
                new_row.setDateTime(3, year, month, day, hour, min, sec, frac)

                new_row.setCharString(4, row['tweet_text'])

                # check if there is a value for longitude, otherwise write a 0
                try:
                    longitude = float(row['longitude'])
                except:
                    longitude = 0
                new_row.setDouble(5, longitude)

                new_row.setCharString(6, row['source'])
                new_row.setCharString(7, row['user'])

                # check if there is a value for latitude, otherwise write a 0
                try:
                    latitude = float(row['latitude'])
                except:
                    latitude = 0
                new_row.setDouble(8, latitude)

                table.insert(new_row)

    # if the process fails we want to be able to re-run it without collisions between file names
    # so give each file a unique name (unix time stamp in this case).
    os.rename(file_name, str(time.time()).split('.')[0] + '.csv')

    # cd back to working directory
    os.chdir(WORKING_DIRECTORY)

    return