def csv2sql(data_frame, schema, output_path):
    """Load a DataFrame into an in-memory SQLite table and dump it as SQL text.

    Missing values are filled with the per-column mean of the numeric columns,
    the frame is written to a table named after ``schema['name']`` (lower-cased)
    and the resulting database is dumped statement by statement to
    ``output_path + '.sql'``.

    Args:
        data_frame: pandas DataFrame holding the table rows.
        schema: Mapping with a ``'name'`` entry (table name) and a
            ``'columns'`` list whose items carry ``'name'`` and ``'type'``.
            Each type is resolved through ``column_types.get(...).sql_type``.
        output_path: Destination path without extension; ``'.sql'`` is
            appended.
    """
    engine = create_engine('sqlite://', echo=False)
    try:
        # numeric_only=True keeps this working on pandas >= 2.0, where mean()
        # over a frame that contains non-numeric columns raises TypeError.
        data_frame = data_frame.fillna(data_frame.mean(numeric_only=True))

        sql_schema = {}
        for col in schema['columns']:
            type_spec = column_types.get(col['type']).sql_type
            # A spec is either a bare attribute name of `types`, or a name
            # followed by parenthesised constructor arguments: "Name(args)".
            type_name, has_args, raw_args = type_spec.partition('(')
            type_args = ()
            if has_args:
                inner = raw_args[:-1].strip()  # drop the closing ')'
                if inner:
                    # The trailing comma makes a single argument parse as a
                    # one-element tuple.
                    type_args = ast.literal_eval('(' + inner + ',)')
            sql_schema[col['name']] = getattr(types, type_name)(*type_args)

        data_frame.to_sql(schema['name'].lower(), con=engine,
                          if_exists='replace', dtype=sql_schema)
        print("Dumping Table", schema["name"])

        # Context managers guarantee that both the database connection and the
        # output file are released, even if the dump fails part-way through.
        with engine.connect() as conn, open(output_path + '.sql', 'w') as stream:
            stream.writelines(
                f"{line}\n" for line in conn.connection.iterdump()
            )
    finally:
        # Release the pooled connection that backs the in-memory database.
        engine.dispose()
def get_sql_query(self, csv, q):
    """Build the SQL text for question ``q`` asked against ``csv``.

    The base statement comes from ``Clause.adapt`` and is formatted with the
    unknown slot (``'*'`` when there is none) and the lower-cased table name.
    One ``WHERE``/``AND`` fragment is then appended per filled slot.

    Args:
        csv: Identifier passed to ``slot_fill`` and ``get_schema_for_csv``.
        q: The natural-language question.

    Returns:
        A ``(question, valmap)`` tuple: the assembled SQL text and a dict
        mapping generated ``val_N`` keys to the values of String-typed slots.
    """
    sf = self.slot_fill(csv, q)
    schema = self.data_process.get_schema_for_csv(csv)
    filled_columns = [slot[0] for slot in sf]
    keywords = self.kword_extractor(q)
    unknown_slot, flag = self.unknown_slot_extractor(schema, filled_columns, keywords)

    clause = Clause()
    question = ""
    if not flag:
        question = clause.adapt(q)
    else:
        # NOTE(review): `question` is overwritten on every iteration, so only
        # the last column decides whether priority=True is used -- confirm
        # that this is intended.
        for column in schema["columns"]:
            if "priority" in column:
                question = clause.adapt(q, inttype=True, priority=True)
            else:
                question = clause.adapt(q, inttype=True)

    selected = '*' if unknown_slot is None else unknown_slot
    question = question.format(selected, schema["name"].lower())

    valmap = {}
    print(sf)
    for position, slot in enumerate(sf):
        column_name, value = slot[0], slot[2]
        slot_type = column_types.get(slot[1])

        # The first condition opens the WHERE clause; later ones chain with AND.
        if position == 0:
            template = ''' WHERE {} = "{}" '''
        else:
            template = '''AND {} = "{}" '''

        if issubclass(slot_type, column_types.Number):
            value = self.cond_map(value)

        if issubclass(slot_type, column_types.String):
            # String values are recorded in valmap under a generated key and
            # the key, not the value, is written into the fragment.
            operand = f"val_{len(valmap)}"
            valmap[operand] = value
        else:
            operand = value

        has_condition = any(ch in conditions.keys() for ch in operand)
        fragment = template.format(column_name, operand)
        if has_condition:
            # The operand contains a key of `conditions`, so the literal '='
            # and the double quotes are stripped from the fragment.
            fragment = fragment.replace('=', '').replace('"', '')
        question += fragment

    return question, valmap
def query_db(self, question):
    """Answer ``question`` by querying an in-memory SQLite copy of its CSV.

    The CSV is chosen with ``self.nlp.csv_select``, loaded as a string-typed
    DataFrame, written to a SQLite table named after the schema, and queried
    with the statement produced by ``Agent.get_query``.

    Args:
        question: The natural-language question.

    Returns:
        The list of rows returned by ``fetchall()`` on the executed query.
    """
    engine = create_engine('sqlite://', echo=False)
    csv = self.nlp.csv_select(question)
    frame = self.data_process.get_dataframe(csv).astype(str)
    schema = self.data_process.get_schema_for_csv(csv)
    # NOTE(review): the frame was already cast to str above, so this fill
    # probably has nothing left to replace -- confirm the intended order.
    frame = frame.fillna(frame.mean())

    dtype_by_column = {}
    for column in schema['columns']:
        column_name = column['name']
        type_spec = column_types.get(column['type']).sql_type
        if '(' in type_spec:
            # "Name(args)": split off the argument list, drop the closing ')'
            # and add a trailing comma so it always parses as a tuple.
            type_name, raw_args = type_spec.split('(')
            factory = getattr(types, type_name)
            sql_type = factory(*ast.literal_eval('(' + raw_args[:-1] + ',)'))
        else:
            sql_type = getattr(types, type_spec)()
        dtype_by_column[column_name] = sql_type

    frame.to_sql(schema['name'].lower(), con=engine,
                 if_exists='replace', dtype=dtype_by_column)

    agent = Agent(self.data_dir, self.schema_dir)
    query = agent.get_query(question)
    # NOTE(review): Engine.execute is a legacy SQLAlchemy API -- confirm the
    # installed version still provides it.
    result = engine.execute(query)
    return result.fetchall()
def _is_numeric(typ):
    """Return whether the column type registered as ``typ`` is a Number.

    ``typ`` is looked up with ``column_types.get`` and the result is tested
    with ``issubclass`` against ``column_types.Number``.
    """
    # TODO
    resolved = column_types.get(typ)
    return issubclass(resolved, column_types.Number)
def slot_fill(self, csv, q):
    """Extract column values ("slots") for ``csv`` from the question ``q``.

    For every schema column except ``'index'`` a sub-question is put to ``qa``.
    The answers are ranked by score, answers whose token window overlaps a
    better-scored one are dropped, and the survivors are converted with the
    adapter for their column type.

    Args:
        csv: Identifier passed to ``get_schema_for_csv``.
        q: The natural-language question.

    Returns:
        A list of ``(column_name, column_type, adapted_value, score)`` tuples
        for the slots whose adapted value is not ``None``.
    """
    # example: slot_fill(get_csvs()[2], "how many emarati men of age 22 died from stomach cancer in 2012")
    schema = self.data_process.get_schema_for_csv(csv)

    def _is_numeric(typ):
        # TODO
        return issubclass(column_types.get(typ), column_types.Number)

    candidates = []
    mappings = {}
    for column in schema['columns']:
        name = column['name']
        if 'keywords' in column:
            # Rewrite the question to use the column's first keyword in place
            # of the raw column name.
            keyword = column['keywords'][0]
            q = q.replace(name, keyword)
        else:
            keyword = name
        if name == 'index':
            continue

        type_name = column['type']
        if type_name == "Categorical":
            mappings[name] = column["mapping"]

        prompt = "number of {}" if _is_numeric(type_name) else "which {}"
        colquery = prompt.format(keyword)
        answer, score = qa(q, colquery, return_score=True)

        # Locate the answer's token span inside the tokenised question.
        answer_tokens = nltk.word_tokenize(answer)
        start_idx = _find(nltk.word_tokenize(q), answer_tokens)
        end_idx = start_idx + len(answer_tokens) - 1
        print("filling slots:", name, answer, score)
        candidates.append((name, type_name, answer, score, start_idx, end_idx))

    # Highest score first, so better answers claim their token window first.
    candidates.sort(key=lambda slot: -slot[3])

    windows = []
    accepted = []
    for name, type_name, answer, score, start_idx, end_idx in candidates:
        # A negative start presumably means _find did not locate the answer
        # in the question -- TODO confirm.
        if start_idx < 0:
            continue
        window = (start_idx, end_idx)
        if any(_window_overlap(*(window + taken)) for taken in windows):
            continue
        windows.append(window)
        accepted.append((name, type_name, answer, score))

    ret = []
    for name, type_name, answer, score in accepted:
        if type_name == "FuzzyString":
            # NOTE(review): `values` is not defined in this function; it must
            # come from an outer scope that is not visible here.
            matcher = column_types.FuzzyString(values[name], exclude=name.split('_'))
            val = matcher.adapt(answer)
        elif type_name == "Categorical":
            val = column_types.Categorical(mappings[name]).adapt(answer)
        elif _is_numeric(type_name):
            val = column_types.get(type_name)().adapt(
                answer, context=q, allowed_kws=[name]
            )
        else:
            val = column_types.get(type_name)().adapt(answer)

        if val is not None:
            ret.append((name, type_name, val, score))
    return ret