def write_table(data_frame, table_name, sep=',', iotype='fs', remove_tmpfile=True):
    """Persist ``data_frame`` either as a CSV resource or into a Hive table.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Rows to persist.
    table_name : str
        Config key; the real file/table name comes from ``nnenv.getItem(table_name)``.
    sep : str
        Field separator used for the ``'fs'`` CSV output only.
    iotype : str
        ``'fs'`` writes a CSV under the resource path; ``'db'`` stages a CSV in
        ``tmp_dir`` and issues a Hive ``LOAD DATA LOCAL INPATH``.
    remove_tmpfile : bool
        When True, delete the staging CSV after the Hive load.
        (Bug fix: this flag was previously accepted but ignored.)

    Raises
    ------
    ValueError
        If ``iotype`` is neither ``'db'`` nor ``'fs'``.
    """
    if iotype == 'fs':
        data_frame.to_csv(nnenv.getResourcePath() + nnenv.getItem(table_name),
                          sep=sep, index=False)
    elif iotype == 'db':
        import os  # local import: only the db path needs filesystem cleanup

        engine = create_engine(nnenv.getConnectable())
        # Stage the frame as a headerless CSV so Hive can bulk-load it.
        path_tmp_file = nnenv.getItem('tmp_dir') + '/' + nnenv.getItem(table_name)
        data_frame.to_csv(path_tmp_file, index=False, header=False)
        conn = engine.connect()
        try:
            # NOTE(review): table name comes from trusted config (nnenv), not
            # user input, so plain concatenation is tolerated here.
            hive_sql_ = ("LOAD DATA LOCAL INPATH '" + path_tmp_file
                         + "' OVERWRITE INTO TABLE " + nnenv.getItem(table_name))
            result = conn.execute(hive_sql_)
            result.close()
        finally:
            # Always release the connection; previously it leaked.
            conn.close()
            if remove_tmpfile and os.path.exists(path_tmp_file):
                os.remove(path_tmp_file)
    else:
        # ValueError (subclass of Exception) replaces the old bare
        # print + ``raise (Exception)`` so callers get the message.
        raise ValueError('IOtype is only for db or fs')
def main():
    """Rebuild the TF-IDF matrix and vectorizer from the labeled corpus.

    Pulls the labeled content from MySQL, writes the content-id index mapping,
    segments the text, fits the TF-IDF artifacts, and saves them under the
    resource path.
    """
    # Fetch the labeled corpus from the database first.
    raw_corpus = nn.Dataframefactory('labeledContent', sep='|', iotype='db',
                                     con=nnenv.getItem('mysql_url'))

    # Output locations for the persisted artifacts.
    vectorizer_file = "vectorizer.joblib"
    matrix_file = "tfidf.npy"
    outpath = nnenv.getResourcePath()

    # Segmentation dictionary and stopwords must be loaded before segmenting.
    createDictStop()

    # Merge title and content into a single text column per document.
    corpus = combineTitleAndContent(raw_corpus)

    # Persist the row-index -> content_id mapping alongside the matrix.
    id_mapping = corpus[["content_id"]]
    id_mapping.index.name = 'index'
    id_mapping.to_csv(outpath + nnenv.getItem('content_id_mapping'))

    # Tokenize/segment the combined text into the model's input column.
    corpus["corpus"] = corpus["all"].apply(segment)

    # Fit the TF-IDF matrix and its vectorizer, then save both.
    matrix_out, fitted_vectorizer = createTfidfMatrix(corpus)
    with open(outpath + vectorizer_file, 'wb') as fh:
        joblib.dump(fitted_vectorizer, fh)
    np.save(outpath + matrix_file, matrix_out)

    print("new tfidf_matrix and vectorizer have been saved into {}".format(outpath))
def Dataframefactory(table_name, sep=',', iotype='fs'):
    """Return a pandas DataFrame loaded from a CSV resource or a DB table.

    Parameters
    ----------
    table_name : str
        Config key; the concrete file/table name is ``nnenv.getItem(table_name)``.
    sep : str
        Field separator for the ``'fs'`` (CSV) source only.
    iotype : str
        ``'fs'`` reads a CSV under the resource path; ``'db'`` reads the whole
        SQL table through ``nnenv.getConnectable()``.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If ``iotype`` is neither ``'db'`` nor ``'fs'``.
    """
    if iotype == 'fs':
        return pd.read_csv(nnenv.getResourcePath() + nnenv.getItem(table_name),
                           sep=sep)
    elif iotype == 'db':
        return pd.read_sql_table(table_name=nnenv.getItem(table_name),
                                 con=nnenv.getConnectable())
    else:
        # ValueError (subclass of Exception) replaces print + ``raise (Exception)``
        # so the reason travels with the exception instead of stdout.
        raise ValueError('IOtype is only for db or fs')
def Dataframefactory(table_name, sep=',', iotype='fs', con=None):
    """Return a pandas DataFrame loaded from a CSV resource or a DB table.

    Parameters
    ----------
    table_name : str
        Config key; the concrete file/table name is ``nnenv.getItem(table_name)``.
    sep : str
        Field separator for the ``'fs'`` (CSV) source only.
    iotype : str
        ``'fs'`` reads a CSV under the resource path; ``'db'`` reads the whole
        SQL table.
    con : optional
        SQLAlchemy connectable for the ``'db'`` path. Defaults to
        ``nnenv.getConnectable()``, resolved lazily at call time.
        (Bug fix: the default was previously evaluated once at import time,
        which opens/derives the connectable even when it is never used.)

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If ``iotype`` is neither ``'db'`` nor ``'fs'``.
        (Previously an invalid iotype silently returned ``None``.)
    """
    if iotype == 'fs':
        return pd.read_csv(nnenv.getResourcePath() + nnenv.getItem(table_name),
                           sep=sep, engine='python')
    elif iotype == 'db':
        if con is None:
            con = nnenv.getConnectable()
        return pd.read_sql_table(table_name=nnenv.getItem(table_name), con=con)
    else:
        raise ValueError('IOtype is only for db or fs')
def write_mysql_table(data_frame, table_name, con):
    """Truncate a MySQL table and reload it with the rows of ``data_frame``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Rows to insert.
    table_name : str
        Config key; the real table name is ``nnenv.getItem(table_name)``.
    con : str
        Config key; ``nnenv.getItem(con)`` yields the SQLAlchemy URL.

    Returns
    -------
    str
        Status message on success.
    """
    # Resolve config keys once, into distinct names — the original reused
    # ``con`` for the key, the URL, and the open connection.
    resolved_table = nnenv.getItem(table_name)
    engine = create_engine(nnenv.getItem(con))
    connection = engine.connect()
    try:
        # Truncate through the connection we opened (previously a connection
        # was opened but the truncate went through the engine, leaving the
        # connection unused — and leaked on any exception).
        connection.execute('truncate table ' + resolved_table)
        data_frame.to_sql(name=resolved_table, if_exists='append',
                          con=engine, index=False)
    finally:
        connection.close()
    # Typo fixed: "sucessful" -> "successful".
    return 'successful insert mysql table'
def loading_everything():
    """Populate the module-level model artifacts used by the service.

    Loads the segmentation dictionaries, lookup tables, fitted vectorizer,
    TF-IDF matrix, and the labeled corpus into globals so later calls can
    reuse them without reloading.
    """
    global tag, similar, mapping, clf, tfidf_matrix, labeled_corpus, title_list, content_id_mapping

    # Segmentation dictionary and stopwords come first.
    createDictStop()

    # Lookup tables, then their combined mapping.
    tag = nn.Dataframefactory('tag', iotype='fs')
    similar = nn.Dataframefactory('similar', iotype='fs')
    mapping = mappingCbind(similar, tag)

    # Fitted model artifacts.
    clf = nn.Joblibfactory(nnenv.getItem('vectorizer'))
    tfidf_matrix = nn.Numpyarrayfactory(nnenv.getItem('tfidf'))

    # Labeled corpus plus derived title list, and the row->content_id map.
    labeled_corpus = nn.Dataframefactory('labeledContent', sep='|',
                                         iotype='fs',
                                         con=nnenv.getItem('mysql_url'))
    title_list = labeled_corpus.title.tolist()
    content_id_mapping = nn.Dataframefactory('content_id_mapping', iotype='fs')
def write_table(data_frame, table_name, iotype='fs'):
    """Load ``data_frame`` into a Hive table via a staged local CSV.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Rows to load.
    table_name : str
        Staging file name; the target Hive table is ``nnenv.getItem(table_name)``.
    iotype : str
        Only ``'db'`` is supported by this variant.
        NOTE(review): the default ``'fs'`` therefore always raises — confirm
        callers pass ``iotype='db'`` explicitly.

    Raises
    ------
    ValueError
        For any ``iotype`` other than ``'db'``.
    """
    if iotype == 'db':
        # Local import kept from the original: sqlalchemy only needed here.
        from sqlalchemy import create_engine

        engine = create_engine(nnenv.getConnectable())
        # Stage the frame as a headerless CSV so Hive can bulk-load it.
        path_tmp_file = nnenv.getValue('tmp_dir') + '/' + table_name
        data_frame.to_csv(path_tmp_file, index=False, header=False)
        conn = engine.connect()
        try:
            hive_sql_ = ("LOAD DATA LOCAL INPATH '" + path_tmp_file
                         + "' OVERWRITE INTO TABLE " + nnenv.getItem(table_name))
            result = conn.execute(hive_sql_)
            result.close()
        finally:
            # Always release the connection; previously it leaked on error.
            conn.close()
    else:
        # Raise an instance with a message instead of the bare class
        # ``raise (ValueError)`` so callers see why it failed.
        raise ValueError('iotype must be db for this writer')
# Ad-hoc inspection script: connects to the configured MySQL instance and
# prints row count, schema, and index information for the recommendation table.
import pandas as pd
from datetime import datetime
import nnenv
import nndw
# Example usage of the project writer:
# nndw.write_mysql_table(df, table_name='iqvia_4pe_hcp_recommendation_new', con=nnenv.getItem('mysql_con'))
from sqlalchemy import create_engine

# Build an engine from the configured MySQL URL and open a connection
# (the returned connection object is not kept; this just checks connectivity).
engine = create_engine(nnenv.getItem('mysql_con'))
engine.connect()

# Row count of the target table.
for item in engine.execute(
        'select count(1) from iqvia_4pe_hcp_recommendation_new'):
    print(item)

# Column definitions.
for item in engine.execute('desc iqvia_4pe_hcp_recommendation_new'):
    print(item)

# Index definitions (raw SHOW INDEX output).
for item in engine.execute('show index from iqvia_4pe_hcp_recommendation_new'):
    print(item)

from sqlalchemy import create_engine  # NOTE(review): duplicate import, already imported above
from sqlalchemy.engine import reflection

# Cross-check: enumerate indexes for every table via the reflection API.
insp = reflection.Inspector.from_engine(engine)
for name in insp.get_table_names():
    for index in insp.get_indexes(name):
        print(index)

# NOTE(review): t1 is unused in this chunk — presumably the start of a timing
# measurement whose remainder lies outside this view; confirm before removing.
t1 = datetime.now()