def test_01_create_file(self):
    """Save lines to one file, load the reference file, then drop it."""
    writer = cl.DataTable(fname2, 'file')
    writer.save(fname2, ['test data', 'another line', 'final line'])
    dropper = cl.DataTable(fname3, 'file')
    # NOTE(review): loads fname (not fname2, which was just saved) — confirm intended
    loaded = writer.load(fname)
    self.assertEqual(len(loaded), 157)
    dropper.drop(fname)
def main():
    """
    This is an example of project documentation using AIKIF
    It documents the project itself, including requirements, design, test, goals,
    """
    print('Initialising AIKIF Project...')
    name = 'AIKIF'
    # renamed from `type` so the builtin is not shadowed; passed positionally,
    # so the Project interface is unchanged
    prj_type = 'Software'
    desc = """ Artificial Intelligence Knowledge Information Framework - Project Overview
    This document was autogenerated via aikif/examples/AIKIF_project.py
    """
    desc += '\n Last updated ' + mod_dt.TodayAsString()
    fldr = os.getcwd()  # 'T:\\user\\dev\\src\\python\\AIKIF\\aikif\\examples\\AIKIF_project'
    report_file_base = fldr + os.sep + 'aikif_report'
    p = mod_prj.Project(name, prj_type, desc, fldr)
    project_setup(p)

    # requirements: id, optional parent dep_id, short name, long details
    requirements = mod_dt.DataTable('requirements.csv', ',',
                                    col_names=['id', 'dep_id', 'name', 'details'])
    p.log_table(requirements)
    req_rows = [
        ['a', '', 'process data', 'automatically process source files to tables based on rules'],
        ['b', '', 'define structures', 'use mappings and ontology to specify what to do'],
        ['c', '', 'log intent', 'log key events'],
        ['d', '', 'methods toolbox', 'implement a set of programs that can be used generically'],
        ['e', '', 'Command Line Interface', 'CLI to query and update datasets and control processes'],
        ['a01', 'a', 'download CSV', 'download CSV file from website'],
        ['a02', 'a', 'load CSV to table', 'import CSV file to Database table'],
        ['a03', 'a', 'find blank rows in CSV', 'read CSV file and count DQ issues'],
        ['a04', 'a', 'aggregate table', 'summarise Database table by col(n)'],
        ['e01', 'e', 'CLI commands', 'CLI to manage commands and modes of operation'],
        ['e02', 'e', 'CLI query', 'query functions of datasets in AIKIF using basic fixed commands'],
        ['e03', 'e', 'CLI NLP', 'integrate NLP to allow english questions and data addition in add/query mode'],
        ['e04', 'e', 'CLI add', 'allows user to add data to general or specific datasets'],
        ['e05', 'e', 'CLI process', 'allows managing of all processes in AIKIF'],
    ]
    for row in req_rows:
        p.record(requirements, '', row)

    # progress: completion estimate per program
    progress = mod_dt.DataTable('progress.csv', ',',
                                col_names=['program', 'percent', 'details'])
    p.log_table(progress)
    for row in (['knowledge', '1%', 'class to manage raw data to information'],
                ['mapper', '20%', 'mapping columns to data structures, with business rules'],
                ['sql_code_generator', '90%', 'generates SQL to transform data external to AIKIF']):
        p.record(progress, '', row)

    # issues: simple status list
    issues = mod_dt.DataTable('issues.csv', ',',
                              col_names=['id', 'name', 'details'])
    p.log_table(issues)
    for row in (['01', 'In Progress', 'implement AIKIF project logging'],
                ['02', 'todo', 'implement Knowledge mapping'],
                ['03', 'Testing', 'data mapping of columns']):
        p.record(issues, '', row)

    # p.build_report(report_file_base + '.md', type='md')
    p.build_report(report_file_base + '.rst', type='rst')
    print('Done...')
def test_06_create_blank_data_structure(self):
    """A DataTable built with col_names exposes those names in order."""
    table = cl.DataTable('sales.csv', ',', col_names=['date', 'amount', 'details'])
    for idx, expected in enumerate(['date', 'amount', 'details']):
        self.assertEqual(table.col_names[idx], expected)
def test_05_update_where1(self):
    """Rank tot1/tot2 per (TERM, ID) group via percentiles and spot-check cells."""
    fle = cl.DataTable(fname, ',')
    fle.load_to_array()
    # distinct (TERM, ID) pairs define the groups to update
    dist_cols = fle.get_distinct_values_from_cols(['TERM', 'ID'])
    fle.add_cols(['RNK_tot1', 'RNK_tot2'])
    for new_col in ['tot1', 'tot2']:
        for i in dist_cols:
            first, third, median = fle.calc_percentiles(
                new_col, ['TERM', 'ID'], [str(i[0]), str(i[1])])
            # NOTE(review): only `first` (25th percentile) is written back;
            # `third` and `median` are computed but unused — confirm intended
            fle.update_where('RNK_' + new_col, first, ['TERM', 'ID'], [str(i[0]), str(i[1])])
    fle.save_csv(fname3)
    """ ===========================================================
    TERM    GENDER  ID      tot1    tot2    RNK_tot1  RNK_tot2
    5300    F       00078   18      66      18        66
    7310    M       00078   10      12      14.0      7.0
    7310    M       00078   18      465     14.0      7.0
    7310    F       00078   30      2       14.0      7.0
    7310    F       00016   25      12      35.5      227.25
    5300    M       00016   31      0       31        0
    7310    F       00016   67      873     35.5      227.25
    """
    self.assertEqual(
        fle.get_header(),
        ['TERM', 'GENDER', 'ID', 'tot1', 'tot2', 'RNK_tot1', 'RNK_tot2'])
    # single-row groups keep their original (string) values...
    self.assertEqual(fle.arr[0][3], '18')
    self.assertEqual(fle.arr[0][4], '66')
    # ...multi-row groups get computed float percentiles
    self.assertEqual(fle.arr[1][5], 14.0)
    self.assertEqual(fle.arr[1][6], 7.0)
    self.assertEqual(fle.arr[6][6], 227.25)
def create_map_from_file(self, data_filename):
    """
    Read data_filename into a DataTable, generate a map from it, and
    write the map as a companion '<data_filename>.rule' file.
    For all datafiles mapped, there exists a .rule file to define it.
    """
    op_filename = data_filename + '.rule'
    dataset = mod_datatable.DataTable(data_filename, ',')
    dataset.load_to_array()
    l_map = self.generate_map_from_dataset(dataset)
    with open(op_filename, 'w') as f:
        f.write('# rules file autogenerated by mapper.py v0.1\n')
        f.write('filename:source=' + data_filename + '\n')
        f.write('filename:rule=' + op_filename + '\n\n')
        for row in l_map:
            #print('ROW = ' , row)
            if isinstance(row, str):  # idiomatic type check (was: type(row) is str)
                f.write(row + '\n')
            else:
                # row is a sequence of string fragments — emit them as-is
                f.writelines(row)
def create_sample_projects():
    """Build and return a Projects collection with three sample projects."""
    acute = project.Project(name='Acute Software',
                            desc='Custom Software development', fldr='')
    acute.add_detail('website', 'http://www.acutesoftware.com.au')
    acute.add_detail('email', '*****@*****.**')

    sales = project.Project(name='Sales Log', desc='Record list of sales', fldr='')
    sales.add_detail('Note', 'List of sales taken from manual entries in test program')
    expenses = cls_datatable.DataTable('expenses.csv', ',',
                                       col_names=['date', 'amount', 'details'])
    for row in (['2015-02-13', 49.94, 'restaurant'],
                ['2015-02-15', 29.00, 'petrol'],
                ['2015-02-17', 89.95, 'fringe tickets']):
        sales.record(expenses, 'Expense', row)

    diary = project.Project(name='Diary', fldr=root_folder,
                            desc='Diary database for PIM application')
    for src_name in ('Calendar', 'Bookmarks', 'File Usage', 'PC Usage', 'TODO List'):
        diary.add_source(src_name, root_folder)

    collection = project.Projects()
    collection.add_project(diary)
    collection.add_project(acute)
    collection.add_project(sales)
    return collection
def test_03_get_distinct_values_from_cols3(self):
    """Distinct (TERM, ID) pairs match the four known combinations."""
    table = cl.DataTable(fname, ',')
    table.load_to_array()
    expected = [('5300', '00016'), ('5300', '00078'),
                ('7310', '00016'), ('7310', '00078')]
    actual = table.get_distinct_values_from_cols(['TERM', 'ID'])
    self.assertEqual(sorted(actual), sorted(expected))
def test_04_add_cols(self):
    """add_cols() appends the new column names to the header."""
    table = cl.DataTable(fname, ',')
    table.load_to_array()
    table.add_cols(['NEW1', 'NEW2'])
    #table.describe_contents()
    expected = ['TERM', 'GENDER', 'ID', 'tot1', 'tot2', 'NEW1', 'NEW2']
    self.assertEqual(table.get_header(), expected)
def test_07_add_data(self):
    """Rows appended with add() are stored and counted by count_unique_values()."""
    table = cl.DataTable('sales.csv', ',', col_names=['date', 'amount', 'details'])
    for row in (['2015-01-09', 24.95, 'Timer'],
                ['2015-02-17', 45.00, 'Diary'],
                ['2015-02-19', 24.95, 'Timer']):
        table.add(row)
    self.assertEqual(len(table.arr), 3)
    self.assertEqual(len(table.count_unique_values(0, 'date')), 3)
    self.assertEqual(len(table.count_unique_values(2, 'details')), 2)
def test_02_percentile(self):
    """percentile() interpolates linearly within a numeric list."""
    table = cl.DataTable('', '"')
    cases = [
        ([1, 2, 3, 4, 5, 6], .25, 2.25),
        ([1, 2, 3, 4, 5, 6], .5, 3.5),
        ([1, 2, 3, 4, 5, 6], .75, 4.75),
        ([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], .25, 3.25),
        ([1, 1, 2], .5, 1),
        ([1, 1, 2], .25, 1),
        ([1, 1, 2], .75, 1.5),
    ]
    for values, pct, expected in cases:
        self.assertEqual(table.percentile(values, pct), expected)
def main():
    """Demonstrate three ways of recording journal entries with AIKIF."""
    fname = 'journal.csv'
    p = mod_prj.Project('Journal Record')
    print(p.nme)  # NOTE(review): attribute spelled 'nme' — confirm against Project class

    # Attempt #1 - using DataTable directly (TOK)
    journal = mod_dat.DataTable(fname, ',', col_names=['date', 'category', 'details'])
    for entry in (['2017-12-07', 'Software', 'test online version'],
                  ['2017-06-11', 'Software', 'update readme'],
                  ['2015-05-11', 'Shopping', 'bought jeans']):
        journal.add(entry)
    print(journal)
    """
    date        category    details
    11/05/2015  Software    creating LP_ADD_DATA.py to record journal to diary
    11/05/2015  Software    update readme
    11/05/2015  Shopping    bought jeans
    """
    journal.save_csv(fname)

    # attempt #2 using Core DATA (TOK)
    e = mod_core.CoreDataWhen('Sales Meeting', [
        '2015-04-11', 'Office', 'Meet with client to discuss custom software'
    ])
    print(e.format_csv())

    # attempt #3 use an Events class to manage it all
    ev = Events(os.getcwd(), 'D', 'DAT')
    for evt_name, evt_data in (
            ('Sales Meeting',
             ['2014-01-11', 'Office', 'Catchup with client']),
            ('Sales Meeting#3',
             ['2015-03-11', 'Office', 'Catchup with client']),
            ('DEV AIKIF - core data',
             ['2015-05-11', 'Software', 'update TEST - no test for CORE_DATA']),
            ('DEV LifePim - core data',
             ['2015-03-11', 'Software', 'use data for LifePim']),
            ('DEV AIKIF - data tools',
             ['2015-05-11', 'Software', 'fix data tools '])):
        ev.add(mod_core.CoreDataWhen(evt_name, evt_data))
    print(ev)
    ev.save()

    txt = 'Catchup'  # 'data'
    print('\n Searching for ', txt)
    for s in ev.find(txt):
        print(s)  # s.data[2]
def _create_from_csv(self):
    """ create a standard data object based on CSV file """
    import aikif.dataTools.cls_datatable as cl
    table = cl.DataTable(self.input_data, ',')
    table.load_to_array()
    self.content['data'] = table.arr
    # log what was read so the processing history is traceable
    lg.record_process('_create_from_csv',
                      'read ' + self._calc_size_stats() + ' from ' + self.input_data)
def make_table(f_all, tbl, cols):
    """
    Generate the CREATE script (fact table + index) for `tbl` with
    columns `cols`, save it under fldr, and append an @-call line for
    it to the master script f_all.
    """
    tname = 'CREATE_' + tbl
    # NOTE(review): the original bound `t` to a mod_table.DataTable and then
    # immediately rebound it to the SQLCodeGenerator below, so the DataTable
    # was never used — the dead construction has been removed.
    #t.save_csv(fldr + os.sep + tbl + '.csv')

    # create the SQL of a file
    t = SQLCodeGenerator(tbl)
    t.set_column_list(cols)
    t.create_script_fact()
    t.create_index(tbl, cols)
    t.save_ddl(fldr + os.sep + tname + '.SQL')
    f_all.write('@' + tname + '.SQL;\n')
def load_random_tables(d):
    """Generate a large random CSV, then import it into 400 test schemas via d."""
    data_dir = os.path.abspath(
        os.path.dirname(os.path.abspath(__file__)) + os.sep + '..' + os.sep +
        '..' + os.sep + 'data')
    fname = data_dir + os.sep + 'temp' + os.sep + 'TEMP_LOAD_TESTING.csv'
    create_test_file(fname)
    table = mod_dt.DataTable(fname, ',')
    table.load_to_array()
    num_loads = 400
    for load_num in range(20, 20 + num_loads):
        # each load goes into its own zero-padded schema name, e.g. tst00020
        d.import_datatable(table, 'tst' + str(load_num).zfill(5), 0)
    print('dbsize=', d.connection.dbsize(), ' Total memory=',
          d.connection.info()['used_memory_human'])
def create_test_file(fname):
    """ create a very large random test file """
    labels = ['id2', 'DATE', 'name', 'surname', 'Born', 'Location', 'Quote',
              'Score', 'Points']
    generators = ['STRING', 'DATE', 'PEOPLE', 'PEOPLE', 'PLACE', 'PLACE',
                  'WORD', 'INT', 'INT']
    # 9 columns x 100000 rows of random data
    table = mod_dt.DataTable(fname, ',')
    table.arr = mod_gen.random_table(9, 100000, generators, labels)
    table.header = labels
    table.save_csv(fname, False)
def test_50_extract_csv_to_fact(self):
    """ read a CSV file to facts and parse it to CoreData obects """
    import aikif.dataTools.cls_datatable as cl
    table = cl.DataTable(os.path.join(pth, 'data', 'core', 'LOCATION_WORLD.csv'), ',')
    table.load_to_array()
    csv_res = ''
    for row in table.arr:
        fact = mod_core.CoreDataWhy(row[1], [{'code': row[1], 'name': row[2]}])
        csv_res += fact.format_csv()
        self.assertEqual(fact.name, row[1])
    # header row appears in the accumulated CSV output
    self.assertTrue('"COD","' in csv_res)
    self.assertTrue("'name': 'LABEL'" in csv_res)
    self.assertTrue("'code': 'COD'" in csv_res)
    self.assertTrue("'West Bank'" in csv_res)
    self.assertEqual(len(csv_res), 12163)
def test_02_record(self):
    """Details, recorded expense rows, and report builds all succeed."""
    prj = project.Project(name='Sales Log', desc='Record list of sales', fldr='')
    prj.add_detail('Note',
                   'List of sales taken from manual entries in test program')
    self.assertEqual(prj.details[0][0], 'Note')
    self.assertEqual(prj.details[0][1],
                     'List of sales taken from manual entries in test program')
    expenses = cls_datatable.DataTable('expenses.csv', ',',
                                       col_names=['date', 'amount', 'details'])
    for row in (['2015-02-13', 49.94, 'restaurant'],
                ['2015-02-15', 29.00, 'petrol'],
                ['2015-02-17', 89.95, 'fringe tickets']):
        prj.record(expenses, 'Expense', row)
    prj.log_table(expenses)
    for report_name, fmt in (('task.rst', 'rst'),
                             ('task.md', 'md'),
                             ('task.html', 'html')):
        prj.build_report(report_name, fmt)
    self.assertEqual(len(expenses.arr), 3)
    self.assertEqual(expenses.arr[1][2], 'petrol')
def main():
    """Document the Allen_AI project: details, lookup sources, progress,
    tool runs, and a final .rst report."""
    p = mod_prj.Project('Allen_AI', tpe='Software', fldr=op_folder,
                        desc='Kaggle competetion entry for Allen_AI')
    for key, val in (
            ('kaggle_url', 'https://www.kaggle.com/c/the-allen-ai-science-challenge'),
            ('files_root_folder', root_folder),
            ('files_src_data', src_data),
            ('files_op_folder', op_folder),
            ('date_last_ran', mod_dt.TodayAsString())):
        p.add_detail(key, val)

    lookup_src = mod_dt.DataTable('lookup_src.csv', ',', col_names=['name', 'url'])
    p.log_table(lookup_src)
    for src in (
            ['page-resources', 'http://aclweb.org/aclwiki/index.php?title=RTE_Knowledge_Resources#Publicly_available_Resources'],
            ['science_notes', 'http://www.ck12.org/'],
            ['data-wikipedia', 'https://en.wikipedia.org/wiki/Wikipedia:Database_download'],
            ['YAGO', 'http://yago-knowledge.org'],
            ['Dbpedia', 'http://dbpedia.org'],
            ['Freebase', 'http://freebase.com'],
            ['Entitycube', 'http://entitycube.research.microsoft.com'],
            ['renlifang', 'http://renlifang.msra.cn'],
            ['NELL', 'http://rtw.ml.cmu.edu'],
            ['DeepDive', 'http://deepdive.stanford.edu'],
            ['Probase', 'http://research.microsoft.com/en-us/projects/probase/'],
            ['KnowItAll', 'http://openie.cs.washington.edu'],
            ['ReVerb', 'http://reverb.cs.washington.edu'],
            ['BabelNet', 'http://babelnet.org'],
            ['WikiNet', 'http://www.h-its.org/english/research/nlp/download/'],
            ['ConceptNet', 'http://conceptnet5.media.mit.edu'],
            ['WordNet', 'http://wordnet.princeton.edu'],
            ['Linked Open Data', 'http://linkeddata.org']):
        p.record(lookup_src, '', src)

    progress = mod_dt.DataTable('progress.csv', ',',
                                col_names=['program', 'percent', 'details'])
    p.log_table(progress)
    for row in (
            ['Source data download', '100%', 'download competition sample data'],
            ['Allen_AI_install.py', '0%', 'downloads lookup data, unzips'],
            ['Allen_AI_run.py', '2%', 'main script to run the program'],
            ['method1.py', '5%', 'method to answer questions using heuristic #1'],
            ['method2.py', '0%', 'method to answer questions using heuristic #2'],
            ['method3.py', '0%', 'method to answer questions using heuristic #3']):
        p.record(progress, '', row)

    # register the three solver scripts; all share the same signature
    t = mod_tool.Toolbox()
    for script in ('method1.py', 'method2.py', 'method3.py'):
        t.add({
            'file': script,
            'function': 'solve',
            'args': ['list'],
            'return': ['int']
        })

    results = mod_dt.DataTable(
        'results.csv', ',',
        col_names=['program', 'function', 'param', 'result', 'date_ran'])
    p.log_table(results)
    for tool_num, tool in enumerate(t.lstTools):
        print('tool=', tool)
        for param in params:
            result = t.run(tool, ['test' + str(tool_num), param], root_folder)
            p.record(results, '', [tool['file'], tool['function'], param,
                                   result, mod_dt.TodayAsString()])
    results.save_csv(os.path.join(root_folder, 'allen_AI_results.csv'))
    p.build_report('allen_AI.rst', tpe='rst')
    print('Done...')
def test_03_get_distinct_values_from_cols2(self):
    """Distinct TERM values form a single set of the two known codes."""
    table = cl.DataTable(fname, ',')
    table.load_to_array()
    actual = table.get_distinct_values_from_cols(['TERM'])
    self.assertEqual(sorted(actual), [{'5300', '7310'}])
def test_03_get_distinct_values_from_cols1(self):
    """Distinct GENDER values form a single set of 'F' and 'M'."""
    table = cl.DataTable(fname, ',')
    table.load_to_array()
    actual = table.get_distinct_values_from_cols(['GENDER'])
    self.assertEqual(sorted(actual), [{'F', 'M'}])
def main():
    """
    This is an example of project documentation using AIKIF
    It documents the project itself, including requirements, design, test, goals,
    """
    print('Initialising AIKIF Project...')
    name = 'AIKIF'
    # renamed from `type` so the builtin is not shadowed; passed positionally,
    # so the Project interface is unchanged
    prj_type = 'Software'
    desc = """ Artificial Intelligence Knowledge Information Framework - Project Overview
    This document was autogenerated via aikif/examples/AIKIF_project.py
    """
    desc += '\n Last updated ' + mod_dt.TodayAsString()
    fldr = os.getcwd()  # 'T:\\user\\dev\\src\\python\\AIKIF\\aikif\\examples\\AIKIF_project'
    report_file_base = fldr + os.sep + 'aikif_report'
    p = mod_prj.Project(name, prj_type, desc, fldr)
    project_setup(p)

    # requirements: id, optional parent dep_id, short name, long details
    requirements = mod_dt.DataTable(
        'requirements.csv', ',', col_names=['id', 'dep_id', 'name', 'details'])
    p.log_table(requirements)
    req_rows = [
        ['a', '', 'process data', 'automatically process source files to tables based on rules'],
        ['b', '', 'define structures', 'use mappings and ontology to specify what to do'],
        ['c', '', 'log intent', 'log key events'],
        ['d', '', 'methods toolbox', 'implement a set of programs that can be used generically'],
        ['a01', 'a', 'download CSV', 'download CSV file from website'],
        ['a02', 'a', 'load CSV to table', 'import CSV file to Database table'],
        ['a03', 'a', 'find blank rows in CSV', 'read CSV file and count DQ issues'],
        ['a04', 'a', 'aggregate table', 'summarise Database table by col(n)'],
    ]
    for row in req_rows:
        p.record(requirements, '', row)

    # issues: simple status list
    issues = mod_dt.DataTable('issues.csv', ',',
                              col_names=['id', 'name', 'details'])
    p.log_table(issues)
    p.record(issues, '', ['01', 'todo', 'implement AIKIF project logging'])

    # p.build_report(report_file_base + '.md', type='md')
    p.build_report(report_file_base + '.rst', type='rst')
    print('Done...')