def writeTDE(q, outputFileName, types): # Define type maps typesTable = { 'Bool': tde.Type.BOOLEAN, 'Int': tde.Type.INTEGER, 'Double': tde.Type.DOUBLE, 'Date': tde.Type.DATE, 'DateTime': tde.Type.DATETIME, 'Duration': tde.Type.DURATION, 'CharString': tde.Type.CHAR_STRING, 'UnicodeString': tde.Type.UNICODE_STRING } # Step 1: Create the Extract file and open the .csv tdefile = tde.Extract(outputFileName + '.tde') if tdefile.hasTable('Extract'): table = tdefile.openTable('Extract') tableDef = table.getTableDefinition() columns = q.get() numColumns = len(columns) else: # Step 2: Create the tableDef tableDef = tde.TableDefinition() columns = q.get() numColumns = len(columns) for i in range(numColumns): tableDef.addColumn(columns[i], typesTable[types[i]]) # Step 3: Create the table in the image of the tableDef table = tdefile.addTable('Extract', tableDef) # Step 4: Loop through the csv, grab all the data, put it into rows # and insert the rows into the table newrow = tde.Row(tableDef) while True: nextLine = q.get() if nextLine: for columnIndex in range(numColumns): insertRow(newrow, columnIndex, nextLine[columnIndex], types[columnIndex]) table.insert(newrow) else: break # Step 5: Close the tde tdefile.close()
def write_tde(table_df, tde_fullpath, arg_append): """ Writes the given pandas dataframe to the Tableau Data Extract given by tde_fullpath """ if arg_append and not os.path.isfile(tde_fullpath): print "Couldn't append -- file doesn't exist" arg_append = False # Remove it if already exists if not arg_append and os.path.exists(tde_fullpath): os.remove(tde_fullpath) tdefile = tde.Extract(tde_fullpath) # define the table definition table_def = tde.TableDefinition() # create a list of column names colnames = table_df.columns # create a list of column types coltypes = table_df.dtypes # for each column, add the appropriate info the Table Definition for col_idx in range(0, len(colnames)): cname = colnames[col_idx] ctype = fieldMap[str(coltypes[col_idx])] table_def.addColumn(cname, ctype) # create the extract from the Table Definition if arg_append: tde_table = tdefile.openTable('Extract') else: tde_table = tdefile.addTable('Extract', table_def) row = tde.Row(table_def) for r in range(0, table_df.shape[0]): for c in range(0, len(coltypes)): if str(coltypes[c]) == 'float64': row.setDouble(c, table_df.iloc[r, c]) elif str(coltypes[c]) == 'float32': row.setDouble(c, table_df.iloc[r, c]) elif str(coltypes[c]) == 'int64': row.setDouble(c, table_df.iloc[r, c]) elif str(coltypes[c]) == 'int32': row.setDouble(c, table_df.iloc[r, c]) elif str(coltypes[c]) == 'object': row.setString(c, table_df.iloc[r, c]) elif str(coltypes[c]) == 'bool': row.setBoolean(c, table_df.iloc[r, c]) else: row.setNull(c) # insert the row tde_table.insert(row) tdefile.close() print "Wrote %d lines to %s" % (len(table_df), tde_fullpath)
def pd_tde(df,fname): try: os.system('rm -f ' + fname) os.system('rm -f DataExtract*.log') tdefile = tde.Extract(fname) except: tdefile = tde.Extract(fname) tableDef = tde.TableDefinition() colnames = df.columns coltypes = df.dtypes # for each column, add the appropriate info the Table Definition for i in range(0, len(colnames)): cname = colnames[i] ctype = fieldMap.get(str(coltypes[i])) tableDef.addColumn(cname, ctype) with tdefile as extract: table = extract.addTable("Extract", tableDef) for r in range(0, df.shape[0]): row = tde.Row(tableDef) for c in range(0, len(coltypes)): if str(coltypes[c]) == 'float64': row.setDouble(c, df.iloc[r,c]) elif str(coltypes[c]) == 'float32': row.setDouble(c, df.iloc[r,c]) elif str(coltypes[c]) == 'int64': row.setDouble(c, df.iloc[r,c]) elif str(coltypes[c]) == 'int32': row.setDouble(c, df.iloc[r,c]) elif str(coltypes[c]) == 'object': row.setString(c, df.iloc[r,c]) elif str(coltypes[c]) == 'bool': row.setBoolean(c, df.iloc[r,c]) else: row.setNull(c) # insert the row table.insert(row)
break # got stuff, we're out csvfile.seek(0) # BACK TO THE FRONT! (AGAIN!) os.chdir(targetdir) try: # Just in case the file exists already, we don't want to bomb out tdefile = tde.Extract(tdefilename) # in CWD except: os.system('del ' + targetdir + tdefilename) os.system( 'del DataExtract.log') #might as well erase this bitch too tdefile = tde.Extract(targetdir + tdefilename) # ok lets build the table definition in TDE with our list of names and types first # replacing literals with TDE datatype integers, etc tableDef = tde.TableDefinition() #create a new table def numfields = len(dfields) #print numfields if rowoutput == True: print '*** field names list ***' # debug for t in range(numfields): fieldtypeo = dtypes[t].replace("<type '", "").replace( "'>", "").replace("<class '", "").replace('NoneType', 'str').replace('uuid.UUID', 'str') fieldname = dfields[t] fieldtype = str(fieldtypeo).replace("str", "15").replace( "datetime.datetime", "13").replace("int", "7").replace(
j = c.user_bundle_history() myBundles = j['bundles'] # open the extract try: # Just for testing purposes and re-running tdefile = tde.Extract('bitlyDetails.tde') #in CWD except: os.remove('bitlyDetails.tde') os.remove('DataExtract.log') tdefile = tde.Extract('bitlyDetails.tde') print 'tde ok' # make the extract and define the columns tableDef = tde.TableDefinition() tableDef.addColumn("bundle_link", tde.Type.CHAR_STRING) tableDef.addColumn("bundle_title", tde.Type.CHAR_STRING) tableDef.addColumn("bundle_owner", tde.Type.CHAR_STRING) tableDef.addColumn("bundle_created", tde.Type.DOUBLE) tableDef.addColumn("bundle_description", tde.Type.CHAR_STRING) tableDef.addColumn("bundle_last_mod", tde.Type.DOUBLE) tableDef.addColumn("bundle_IsPriv", tde.Type.BOOLEAN) tableDef.addColumn("link_title", tde.Type.CHAR_STRING) tableDef.addColumn("link_aggregate_link", tde.Type.CHAR_STRING) tableDef.addColumn("link_long_url", tde.Type.CHAR_STRING) tableDef.addColumn("link_link", tde.Type.CHAR_STRING) tableDef.addColumn("clickCountry", tde.Type.CHAR_STRING) tableDef.addColumn("clickCount", tde.Type.DOUBLE) table = tdefile.addTable('Extract', tableDef)
def extract(file_name): # move file to /extract # cd to /extract # if there is no extract called TRACK_TERM then create one, otherwise append to TRACK_TERM # define data model for extract global WORKING_DIRECTORY, TRACK_TERM from_path = WORKING_DIRECTORY + '/' + file_name to_path = WORKING_DIRECTORY + '/extract/' + file_name os.rename(from_path, to_path) os.chdir(WORKING_DIRECTORY + '/extract') # define the extract with tde.Extract(TRACK_TERM + '.tde') as extract: tableDef = tde.TableDefinition() # define the columns and the data types in the extract tableDef.addColumn('lang', tde.Type.CHAR_STRING) #0 tableDef.addColumn('sentiment', tde.Type.DOUBLE) #1 tableDef.addColumn('country', tde.Type.CHAR_STRING) #2 tableDef.addColumn('created_at', tde.Type.DATETIME) #3 tableDef.addColumn('tweet_text', tde.Type.CHAR_STRING) #4 tableDef.addColumn('Longitude', tde.Type.DOUBLE) #5 tableDef.addColumn('source', tde.Type.CHAR_STRING) #6 tableDef.addColumn('user', tde.Type.CHAR_STRING) #7 tableDef.addColumn('Latitude', tde.Type.DOUBLE) #8 table = None if not extract.hasTable('Extract'): # Table does not exist, so create it. print "Creating a new extract" table = extract.addTable('Extract', tableDef) else: # Table exists, so append the new data. print "Appending to an existing extract" table = extract.openTable('Extract') new_row = tde.Row(tableDef) # read the data from the CSV into the extract row object with open(file_name, 'r') as inf: reader = csv.DictReader(inf, delimiter=',', lineterminator='\n') for row in reader: # insert data into the row object in the correct order as defined above new_row.setCharString(0, row['lang']) sentiment = float(row['sentiment']) new_row.setDouble(1, sentiment) new_row.setCharString(2, row['country']) # parse the twitter date string: # Mon Sep 21 11:03:53 +0000 2015 # %a %b %d %H:%M:%S +0000 %Y date_object = datetime.strptime(row['created_at'], '%a %b %d %H:%M:%S +0000 %Y') year = int(datetime.strftime(date_object, '%Y')) month = int(datetime.strftime(date_object, '%m')) day = int(datetime.strftime(date_object, '%d')) hour = int(datetime.strftime(date_object, '%H')) min = int(datetime.strftime(date_object, '%M')) sec = int(datetime.strftime(date_object, '%S')) frac = 0 # fractions of a second aka milliseconds new_row.setDateTime(3, year, month, day, hour, min, sec, frac) new_row.setCharString(4, row['tweet_text']) # check if there is a value for longitude, otherwise write a 0 try: longitude = float(row['longitude']) except: longitude = 0 new_row.setDouble(5, longitude) new_row.setCharString(6, row['source']) new_row.setCharString(7, row['user']) # check if there is a value for latitude, otherwise write a 0 try: latitude = float(row['latitude']) except: latitude = 0 new_row.setDouble(8, latitude) table.insert(new_row) # if the process fails we want to be able to re-run it without collisions between file names # so give each file a unique name (unix time stamp in this case). os.rename(file_name, str(time.time()).split('.')[0] + '.csv') # cd back to working directory os.chdir(WORKING_DIRECTORY) return
def createTDEFile(): # This part is unique for each extract. # Will need to modify.. # TODO: Make this more dynamic ################################################################################################ ################################################################################################ csvSchema = [] csvSchema.append({'<COLUMNNAME>': tdeTypes['INTEGER']}) ################################################################################################ ################################################################################################ # Try to create the TDE File try: tdeFile = tde.Extract(outputfile + '.tde') except: os.remove(outputfile + '.tde') tdeFile = tde.Extract(outputfile + '.tde') # Open newly created CSV File csvFile = open(outputfile + '.csv', "r") reader = csv.reader(csvFile) print 'Reading records from %s' % (outputfile + '.csv') # Create TDE Table definition tdeTableDef = tde.TableDefinition() # Build TDE Table Definition from csv schema above print 'Defined table schema:' for index, item in enumerate(csvSchema): for k, v in item.items(): print 'Column %i: %s <%s>' % ( index, k, tdeTypes.keys()[tdeTypes.values().index(v)]) tdeTableDef.addColumn(k, v) # Add table to TDE File tdeTable = tdeFile.addTable("Extract", tdeTableDef) # Iterate through rows and columns of csv and adding them to TDE File print 'Writing records to %s' % (csvName + '.tde') rownum = 0 for row in reader: if rownum == 0: header = row else: colnum = 0 tdeRow = tde.Row(tdeTableDef) for col in row: if colnum + 1 > len(csvSchema): print 'Something is missing here.' break add_tde_col(colnum, tdeRow, row[colnum], csvSchema[colnum].values()[0]) colnum += 1 tdeTable.insert(tdeRow) tdeRow.close() rownum += 1 print '%i rows added in total in %f seconds' % (rownum - 1, time.clock() - startTime) print 'Closing TDE and CSV File...' tdeFile.close() csvFile.close()
import pandas import dataextract as tde # step 02 get the data into python # ######################################################### url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data' names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class'] dataset = pandas.read_csv(url, names=names) # step 03 create a blank extract # ######################################################### dataExtract = tde.Extract('irisExtract.tde') # step 04 define schema # ######################################################### dataSchema = tde.TableDefinition() dataSchema.addColumn('sepal-length', tde.Type.DOUBLE) dataSchema.addColumn('sepal-width', tde.Type.DOUBLE) dataSchema.addColumn('petal-length', tde.Type.DOUBLE) dataSchema.addColumn('petal-width', tde.Type.DOUBLE) dataSchema.addColumn('class', tde.Type.CHAR_STRING) # step 05 connect schema with blank extract # ######################################################### table = dataExtract.addTable('Extract', dataSchema) # step 06 fill extract with data # ######################################################### newRow = tde.Row(dataSchema) for i in range(0, len(dataset)): newRow.setDouble (0, dataset['sepal-length'][i])