Пример #1
0
def csv2sql(data_frame, schema, output_path):
    """Load a DataFrame into an in-memory SQLite table and dump it as SQL text.

    Args:
        data_frame: pandas DataFrame holding the rows of the table.
        schema: mapping with a ``'name'`` entry (the table name, lower-cased
            before use) and a ``'columns'`` list whose items carry ``'name'``
            and ``'type'`` keys.
        output_path: path prefix of the dump file; ``'.sql'`` is appended.
    """
    engine = create_engine('sqlite://', echo=False)
    # Restrict the mean to numeric columns: on pandas >= 2.0 a plain mean()
    # raises TypeError as soon as the frame holds a non-numeric column.
    data_frame = data_frame.fillna(data_frame.mean(numeric_only=True))

    sql_schema = {}
    for col in schema['columns']:
        type_spec = column_types.get(col['type']).sql_type
        # A spec is either a bare type name or "NAME(arg, ...)". partition()
        # splits on the first '(' only, so the unpacking cannot fail.
        type_name, paren, raw_args = type_spec.partition('(')
        if paren:
            # Drop the closing ')' and add a trailing comma so that a single
            # argument still evaluates to a tuple.
            args = ast.literal_eval('(' + raw_args[:-1] + ',)')
            sql_schema[col['name']] = getattr(types, type_name)(*args)
        else:
            sql_schema[col['name']] = getattr(types, type_name)()

    data_frame.to_sql(schema['name'].lower(),
                      con=engine,
                      if_exists='replace',
                      dtype=sql_schema)
    print("Dumping Table", schema["name"])

    # Both the connection and the output file are released on exit, including
    # when the dump fails half-way through.
    with engine.connect() as conn, open(output_path + '.sql', 'w') as stream:
        for line in conn.connection.iterdump():
            stream.write(line)
            stream.write('\n')
Пример #2
0
 def get_sql_query(self,csv,q):
     """Build the SQL text for question ``q`` asked against the table of ``csv``.

     Returns a ``(question, valmap)`` pair: ``question`` is the adapted clause
     template formatted with the unknown slot (``'*'`` when there is none) and
     the lower-cased table name, followed by one condition per filled slot;
     ``valmap`` maps each generated ``val_<n>`` key to the value of a
     String-typed slot.
     """
     filled_slots = self.slot_fill(csv, q)
     schema = self.data_process.get_schema_for_csv(csv)
     slot_columns = [slot[0] for slot in filled_slots]

     keywords = self.kword_extractor(q)
     unknown_slot, flag = self.unknown_slot_extractor(schema, slot_columns, keywords)
     clause = Clause()

     if not flag:
         question = clause.adapt(q)
     else:
         question = ""
         # NOTE(review): every column overwrites ``question``, so only the last
         # column decides whether ``priority=True`` is used -- confirm intent.
         for column in schema["columns"]:
             if "priority" in column.keys():
                 question = clause.adapt(q, inttype=True, priority=True)
             else:
                 question = clause.adapt(q, inttype=True)

     target = '*' if unknown_slot is None else unknown_slot
     question = question.format(target, schema["name"].lower())

     valmap = {}
     print(filled_slots)
     for position, slot in enumerate(filled_slots):
         column_name = slot[0]
         value = slot[2]
         slot_type = column_types.get(slot[1])
         # The first condition opens the WHERE clause, later ones extend it.
         template = ' WHERE {} = "{}" ' if position == 0 else 'AND {} = "{}" '

         if issubclass(slot_type, column_types.Number):
             value = self.cond_map(value)

         if issubclass(slot_type, column_types.String):
             # String values are stored in valmap under a generated key and
             # that key is what gets written into the query text.
             rendered = f"val_{len(valmap)}"
             valmap[rendered] = value
         else:
             rendered = value

         has_condition = any(item in conditions.keys() for item in rendered)
         fragment = template.format(column_name, rendered)
         if has_condition:
             # The rendered value contains a key of ``conditions``: strip the
             # '=' and the quotes that the template wrapped around it.
             fragment = fragment.replace('=', '').replace('"', '')

         question += fragment

     return question, valmap
Пример #3
0
 def query_db(self,question):
     """Answer ``question`` by running the generated query on an in-memory table.

     The CSV selected for the question is loaded as a DataFrame of strings,
     written to an in-memory SQLite database using the column types from its
     schema, and the query produced by ``Agent.get_query`` is executed on it.

     Returns:
         All rows produced by the query (the result of ``fetchall()``).
     """
     engine = create_engine('sqlite://', echo=False)
     csv = self.nlp.csv_select(question)
     data_frame = self.data_process.get_dataframe(csv).astype(str)
     schema = self.data_process.get_schema_for_csv(csv)

     # numeric_only=True keeps mean() from raising TypeError on the string
     # columns produced by astype(str) under pandas >= 2.0.
     # NOTE(review): after astype(str) there should be no missing values left,
     # so this fill is presumably a no-op -- confirm before removing it.
     data_frame = data_frame.fillna(data_frame.mean(numeric_only=True))

     sql_schema = {}
     for col in schema['columns']:
         type_spec = column_types.get(col['type']).sql_type
         # A spec is either a bare type name or "NAME(arg, ...)". partition()
         # splits on the first '(' only, so the unpacking cannot fail.
         type_name, paren, raw_args = type_spec.partition('(')
         if paren:
             # Drop the closing ')' and add a trailing comma so that a single
             # argument still evaluates to a tuple.
             args = ast.literal_eval('(' + raw_args[:-1] + ',)')
             sql_schema[col['name']] = getattr(types, type_name)(*args)
         else:
             sql_schema[col['name']] = getattr(types, type_name)()

     data_frame.to_sql(schema['name'].lower(), con=engine, if_exists='replace', dtype=sql_schema)
     agent = Agent(self.data_dir, self.schema_dir)
     query = agent.get_query(question)
     # NOTE(review): Engine.execute() was removed in SQLAlchemy 2.0; this call
     # needs a 1.x release. The type of ``query`` is not visible from here, so
     # the call is deliberately left unchanged.
     return engine.execute(query).fetchall()
Пример #4
0
 def _is_numeric(typ):
     """Return True when the column type named ``typ`` is a ``Number`` subclass."""
     # TODO (carried over from the original, which gave no details)
     resolved = column_types.get(typ)
     return issubclass(resolved, column_types.Number)
Пример #5
0
 def slot_fill(self,csv, q):
     """Extract column values ("slots") for question ``q`` from the schema of ``csv``.

     Example call:
         slot_fill(get_csvs()[2], "how many emarati men of age 22 died from stomach cancer in 2012")

     Every schema column except ``'index'`` is queried through ``qa``. The
     answers are ranked by score, answers whose token window overlaps a
     better-scored one are dropped, and the survivors are converted with the
     adapter of their column type.

     Returns:
         A list of ``(column name, column type, adapted value, score)`` tuples;
         slots whose adapted value is ``None`` are left out.
     """
     schema = self.data_process.get_schema_for_csv(csv)

     def _is_numeric(typ):
         # TODO (carried over from the original, which gave no details)
         return issubclass(column_types.get(typ), column_types.Number)

     candidates = []
     mappings = {}
     for column in schema['columns']:
         name = column['name']
         if 'keywords' in column.keys():
             # Rewrite the question so it uses the column's first keyword
             # instead of the raw column name.
             keyword = column['keywords'][0]
             q = q.replace(name, keyword)
         else:
             keyword = name
         if name == 'index':
             continue

         kind = column['type']
         if kind == "Categorical":
             mappings[name] = column["mapping"]

         template = "number of {}" if _is_numeric(kind) else "which {}"
         prompt = template.format(keyword)

         answer, score = qa(q, prompt, return_score=True)
         answer_tokens = nltk.word_tokenize(answer)
         first = _find(nltk.word_tokenize(q), answer_tokens)
         last = first + len(answer_tokens) - 1
         print("filling slots:", name, answer, score)
         candidates.append((name, kind, answer, score, first, last))

     # Highest score first, so a better answer claims its token window first.
     ranked = sorted(candidates, key=lambda slot: -slot[3])

     taken = []
     kept = []
     for slot in ranked:
         span = slot[-2:]
         if span[0] < 0:
             # A negative start index means the slot is skipped.
             continue
         if any(_window_overlap(*(span + other)) for other in taken):
             continue
         taken.append(span)
         kept.append(slot[:-2])

     ret = []
     for name, kind, raw, score in kept:
         if kind == "FuzzyString":
             # NOTE(review): ``values`` is not defined in this method; it is
             # presumably a module-level lookup -- confirm.
             vals = values[name]
             adapter = column_types.FuzzyString(vals, exclude=name.split('_'))
             val = adapter.adapt(raw)
         elif kind == "Categorical":
             val = column_types.Categorical(mappings[name]).adapt(raw)
         elif _is_numeric(kind):
             val = column_types.get(kind)().adapt(raw, context=q, allowed_kws=[name])
         else:
             val = column_types.get(kind)().adapt(raw)
         if val is not None:
             ret.append((name, kind, val, score))

     return ret