Пример #1
0
def write_table(data_frame,
                table_name,
                sep=',',
                iotype='fs',
                remove_tmpfile=True):
    """Persist a DataFrame either as a CSV file or into a database table.

    Args:
        data_frame: Object exposing ``to_csv`` (a pandas DataFrame).
        table_name: Configuration key; ``nnenv.getItem(table_name)`` yields
            the CSV file name in ``'fs'`` mode and the table name in ``'db'``
            mode.
        sep: Field separator for the CSV written in ``'fs'`` mode. It is not
            applied to the temporary CSV written in ``'db'`` mode.
        iotype: ``'fs'`` writes a CSV under ``nnenv.getResourcePath()``;
            ``'db'`` dumps the frame to a temporary CSV and loads it with a
            ``LOAD DATA LOCAL INPATH ... OVERWRITE INTO TABLE`` statement.
        remove_tmpfile: In ``'db'`` mode, delete the temporary CSV once the
            load has been attempted.

    Raises:
        ValueError: If ``iotype`` is neither ``'fs'`` nor ``'db'``.
    """
    if iotype == 'fs':
        data_frame.to_csv(nnenv.getResourcePath() + nnenv.getItem(table_name),
                          sep=sep,
                          index=False)

    elif iotype == 'db':
        # Imported locally so the block does not rely on a module-level
        # ``import os`` being present in the file.
        import os

        engine = create_engine(nnenv.getConnectable())
        target_name = nnenv.getItem(table_name)
        path_tmp_file = nnenv.getItem('tmp_dir') + '/' + target_name

        try:
            # No header/index: the file is consumed as raw rows by the load.
            data_frame.to_csv(path_tmp_file, index=False, header=False)

            hive_sql_ = ('LOAD DATA LOCAL INPATH \'' + path_tmp_file +
                         '\' OVERWRITE INTO TABLE ' + target_name)

            # The context manager guarantees the connection is released even
            # when the statement fails.
            with engine.connect() as conn:
                result = conn.execute(hive_sql_)
                result.close()
        finally:
            if remove_tmpfile and os.path.exists(path_tmp_file):
                os.remove(path_tmp_file)

    else:
        raise ValueError(
            'IOtype is only for db or fs, got {!r}'.format(iotype))
Пример #2
0
def main():
    """Rebuild the TF-IDF matrix and vectorizer and store them on disk.

    Reads the ``labeledContent`` table, writes the ``content_id`` column as a
    CSV mapping, segments the ``all`` column into a ``corpus`` column, builds
    the TF-IDF matrix and vectorizer via ``createTfidfMatrix`` and saves both
    under ``nnenv.getResourcePath()``.
    """
    # File names of the saved artefacts, relative to the resource directory.
    vectorizer_file = "vectorizer.joblib"
    matrix_file = "tfidf.npy"

    raw_corpus = nn.Dataframefactory('labeledContent',
                                     sep='|',
                                     iotype='db',
                                     con=nnenv.getItem('mysql_url'))
    resource_dir = nnenv.getResourcePath()

    # Load dictionary and stop words before any segmentation happens.
    createDictStop()

    corpus = combineTitleAndContent(raw_corpus)

    # Persist the row-index -> content_id mapping.
    id_mapping = corpus[["content_id"]]
    id_mapping.index.name = 'index'
    mapping_path = resource_dir + nnenv.getItem('content_id_mapping')
    id_mapping.to_csv(mapping_path)

    # Segment the combined text into the column consumed downstream.
    corpus["corpus"] = corpus["all"].apply(segment)

    tfidf_matrix, fitted_vectorizer = createTfidfMatrix(corpus)

    # Save the essential artefacts.
    with open(resource_dir + vectorizer_file, 'wb') as handle:
        joblib.dump(fitted_vectorizer, handle)
    np.save(resource_dir + matrix_file, tfidf_matrix)

    print("new tfidf_matrix and vectorizer have been saved into {}".format(
        resource_dir))
Пример #3
0
def Dataframefactory(table_name, sep=',', iotype='fs'):
    """Return a pandas DataFrame loaded from a CSV file or a database table.

    Args:
        table_name: Configuration key; ``nnenv.getItem(table_name)`` yields
            the CSV file name (``'fs'``) or the table name (``'db'``).
        sep: Field separator used when reading the CSV in ``'fs'`` mode.
        iotype: ``'fs'`` reads from ``nnenv.getResourcePath()``; ``'db'``
            reads the table through ``nnenv.getConnectable()``.

    Returns:
        The loaded DataFrame.

    Raises:
        ValueError: If ``iotype`` is neither ``'fs'`` nor ``'db'``.
    """
    if iotype == 'fs':
        csv_path = nnenv.getResourcePath() + nnenv.getItem(table_name)
        return pd.read_csv(csv_path, sep=sep)

    if iotype == 'db':
        return pd.read_sql_table(table_name=nnenv.getItem(table_name),
                                 con=nnenv.getConnectable())

    # ValueError still satisfies callers that catch ``Exception`` and, unlike
    # the bare class, carries the explanation with it.
    raise ValueError('IOtype is only for db or fs, got {!r}'.format(iotype))
Пример #4
0
def Dataframefactory(table_name,
                     sep=',',
                     iotype='fs',
                     con=None):
    """Return a pandas DataFrame loaded from a CSV file or a database table.

    Args:
        table_name: Configuration key; ``nnenv.getItem(table_name)`` yields
            the CSV file name (``'fs'``) or the table name (``'db'``).
        sep: Field separator used when reading the CSV in ``'fs'`` mode.
        iotype: ``'fs'`` reads from ``nnenv.getResourcePath()``; ``'db'``
            reads the table through ``con``.
        con: Connection argument forwarded to ``pandas.read_sql_table``.
            When omitted, ``nnenv.getConnectable()`` is used.

    Returns:
        The loaded DataFrame.

    Raises:
        ValueError: If ``iotype`` is neither ``'fs'`` nor ``'db'``.
    """
    if iotype == 'fs':
        csv_path = nnenv.getResourcePath() + nnenv.getItem(table_name)
        return pd.read_csv(csv_path, sep=sep, engine='python')

    if iotype == 'db':
        # Resolve the default lazily: a call in the signature would run once
        # at definition time, even for callers that never touch the database.
        if con is None:
            con = nnenv.getConnectable()
        return pd.read_sql_table(table_name=nnenv.getItem(table_name),
                                 con=con)

    # Fail loudly instead of implicitly returning None for an unknown mode.
    raise ValueError('IOtype is only for db or fs, got {!r}'.format(iotype))
Пример #5
0
def Joblibfactory(vectorizer):
    """Load a joblib-serialised object from the resource directory.

    ``vectorizer`` is the file name appended to ``nnenv.getResourcePath()``.
    """
    artefact_path = nnenv.getResourcePath() + vectorizer
    return joblib.load(artefact_path)
Пример #6
0
def Numpyarrayfactory(np_name):
    """Load a NumPy array file from the resource directory.

    ``np_name`` is the file name appended to ``nnenv.getResourcePath()``.
    """
    array_path = nnenv.getResourcePath() + np_name
    return np.load(array_path)