def write_table(data_frame, table_name, sep=',', iotype='fs', remove_tmpfile=True):
    """Persist ``data_frame`` as a CSV resource file or load it into a table.

    Args:
        data_frame: Object exposing ``to_csv`` (a pandas DataFrame in practice).
        table_name: Key resolved through ``nnenv.getItem`` to obtain the
            target file name (``'fs'``) or table name (``'db'``).
        sep: Field separator for the CSV written in ``'fs'`` mode. It is not
            applied in ``'db'`` mode, where the staging file is written with
            ``to_csv``'s default separator.
        iotype: ``'fs'`` writes under ``nnenv.getResourcePath()``; ``'db'``
            stages a header-less CSV under the configured ``tmp_dir`` and
            issues a ``LOAD DATA LOCAL INPATH ... OVERWRITE INTO TABLE``
            statement through the configured engine.
        remove_tmpfile: In ``'db'`` mode, delete the staging CSV once the load
            has been attempted (whether it succeeded or failed).

    Raises:
        ValueError: If ``iotype`` is neither ``'fs'`` nor ``'db'``.
    """
    if iotype == 'fs':
        data_frame.to_csv(
            nnenv.getResourcePath() + nnenv.getItem(table_name),
            sep=sep,
            index=False,
        )
    elif iotype == 'db':
        # Local import keeps this block self-contained regardless of what the
        # rest of the file imports.
        import os

        engine = create_engine(nnenv.getConnectable())
        target_table = nnenv.getItem(table_name)
        path_tmp_file = nnenv.getItem('tmp_dir') + '/' + target_table
        try:
            # Stage the rows without index or header so the file holds data only.
            data_frame.to_csv(path_tmp_file, index=False, header=False)
            hive_sql_ = (
                'LOAD DATA LOCAL INPATH \'' + path_tmp_file
                + '\' OVERWRITE INTO TABLE ' + target_table
            )
            conn = engine.connect()
            try:
                result = conn.execute(hive_sql_)
                result.close()
            finally:
                # Always hand the connection back, even if the load failed.
                conn.close()
        finally:
            # Honour remove_tmpfile, which was previously accepted but ignored.
            if remove_tmpfile and os.path.exists(path_tmp_file):
                os.remove(path_tmp_file)
    else:
        raise ValueError(
            'IOtype is only for db or fs, got {!r}'.format(iotype))
def main():
    """Rebuild the TF-IDF matrix and vectorizer from the labeled corpus.

    Reads the ``'labeledContent'`` table, writes the ``content_id`` column
    (indexed under the name ``'index'``) to the configured
    ``content_id_mapping`` file, then saves the fitted vectorizer as
    ``vectorizer.joblib`` and the matrix as ``tfidf.npy`` under
    ``nnenv.getResourcePath()``.
    """
    # Inputs and output locations.
    raw_frame = nn.Dataframefactory(
        'labeledContent',
        sep='|',
        iotype='db',
        con=nnenv.getItem('mysql_url'),
    )
    vectorizer_file = "vectorizer.joblib"
    matrix_file = "tfidf.npy"
    resource_dir = nnenv.getResourcePath()

    # Load dictionary and stop words (as described by the original comment;
    # createDictStop is defined elsewhere).
    createDictStop()

    # Merge title and content into the working corpus frame.
    corpus = combineTitleAndContent(raw_frame)

    # Persist the row-index -> content_id mapping.
    id_frame = corpus[["content_id"]]
    id_frame.index.name = 'index'
    id_frame.to_csv(resource_dir + nnenv.getItem('content_id_mapping'))

    # Segment the combined text into the column consumed by the vectorizer.
    corpus["corpus"] = corpus["all"].apply(segment)

    tfidf_matrix, fitted_vectorizer = createTfidfMatrix(corpus)

    # Save both artifacts next to each other in the resource directory.
    with open(resource_dir + vectorizer_file, 'wb') as handle:
        joblib.dump(fitted_vectorizer, handle)
    np.save(resource_dir + matrix_file, tfidf_matrix)

    print("new tfidf_matrix and vectorizer have been saved into {}".format(resource_dir))
def Dataframefactory(table_name, sep=',', iotype='fs'):
    """Return a pandas DataFrame read from a resource file or a database table.

    Args:
        table_name: Key resolved through ``nnenv.getItem`` to obtain the file
            name (``'fs'``) or table name (``'db'``).
        sep: Field separator passed to ``pd.read_csv``; only used for ``'fs'``.
        iotype: ``'fs'`` reads a CSV under ``nnenv.getResourcePath()``;
            ``'db'`` reads the table via ``nnenv.getConnectable()``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` or ``pd.read_sql_table``.

    Raises:
        ValueError: If ``iotype`` is neither ``'fs'`` nor ``'db'``.
    """
    if iotype == 'fs':
        return pd.read_csv(
            nnenv.getResourcePath() + nnenv.getItem(table_name), sep=sep)
    if iotype == 'db':
        return pd.read_sql_table(
            table_name=nnenv.getItem(table_name), con=nnenv.getConnectable())
    # Carry the message in the exception itself instead of printing it and
    # raising a bare, message-less Exception.
    raise ValueError(
        'IOtype is only for db or fs, got {!r}'.format(iotype))
def Dataframefactory(table_name, sep=',', iotype='fs', con=None):
    """Return a pandas DataFrame read from a resource file or a database table.

    Args:
        table_name: Key resolved through ``nnenv.getItem`` to obtain the file
            name (``'fs'``) or table name (``'db'``).
        sep: Field separator passed to ``pd.read_csv``; only used for ``'fs'``.
        iotype: ``'fs'`` reads a CSV under ``nnenv.getResourcePath()`` with the
            python parsing engine; ``'db'`` reads the table through ``con``.
        con: Connectable handed to ``pd.read_sql_table``. When omitted (or
            ``None``) it is obtained from ``nnenv.getConnectable()`` at call
            time, so nothing is created when the function is merely defined.

    Returns:
        The DataFrame produced by ``pd.read_csv`` or ``pd.read_sql_table``.

    Raises:
        ValueError: If ``iotype`` is neither ``'fs'`` nor ``'db'``.
    """
    if iotype == 'fs':
        return pd.read_csv(
            nnenv.getResourcePath() + nnenv.getItem(table_name),
            sep=sep,
            engine='python',
        )
    if iotype == 'db':
        if con is None:
            # Resolve lazily: a call in the default argument would run once at
            # definition time, even for callers that only ever use 'fs'.
            con = nnenv.getConnectable()
        return pd.read_sql_table(table_name=nnenv.getItem(table_name), con=con)
    # Fail loudly rather than silently returning None for an unknown iotype.
    raise ValueError(
        'IOtype is only for db or fs, got {!r}'.format(iotype))
def Joblibfactory(vectorizer):
    """Load a joblib-serialized object from the resource directory.

    Args:
        vectorizer: File name appended to ``nnenv.getResourcePath()``.

    Returns:
        Whatever ``joblib.load`` yields for that file.
    """
    artifact_path = nnenv.getResourcePath() + vectorizer
    loaded = joblib.load(artifact_path)
    return loaded
def Numpyarrayfactory(np_name):
    """Load a saved NumPy file from the resource directory.

    Args:
        np_name: File name appended to ``nnenv.getResourcePath()``.

    Returns:
        Whatever ``np.load`` yields for that file.
    """
    array_path = nnenv.getResourcePath() + np_name
    loaded = np.load(array_path)
    return loaded