def test_bootstrap():
    """Run the bootstrap extraction over a grid of settings and export every result.

    For each combination of keyword limit (50, 100, 200) and sub-corpus size
    (100, 500, 1000, 2000), calls ``bootstrap_subcorpuses`` with a bootstrap
    size of 25 and writes the outputs under ``res/conv_dico``, ``res/conv_kw``
    and ``res/conv_tm``: once for the aggregated result and once per entry of
    ``allkw``.

    Takes no arguments and returns nothing; all results go through the
    ``utils.export_*`` helpers.
    """
    # Constant across the whole grid, so it is set once up front.
    bootstrapSize = 25

    corpus = utils.get_data(
        'SELECT id FROM refdesc WHERE abstract_keywords IS NOT NULL LIMIT 2000;',
        '../../Data/dumps/20160126_cybergeo.sqlite3')

    for kwLimit in [50, 100, 200]:
        for subCorpusSize in [100, 500, 1000, 2000]:
            relevantkw, relevant_dico, allkw = bootstrap_subcorpuses(
                corpus, kwLimit, subCorpusSize, bootstrapSize)

            # Suffix shared by every file written for this (kwLimit, subCorpusSize) pair.
            grid_tag = str(kwLimit) + '_subCorpusSize' + str(subCorpusSize)

            # Aggregated outputs.
            utils.export_dico_csv(
                relevant_dico,
                'res/conv_dico/bootstrap_relevantDico_kwLimit' + grid_tag
                + '_bootstrapSize' + str(bootstrapSize),
                True)
            utils.export_list(relevantkw, 'res/conv_kw/kw_' + grid_tag, False)
            utils.export_dico_num_csv(relevantkw, 'res/conv_tm/kw_' + grid_tag, False)

            # One pair of files per entry of allkw. Positional indexing is kept
            # on purpose: the concrete container type of allkw is not visible here.
            for run_index in range(len(allkw)):
                run_keywords = allkw[run_index]
                run_tag = grid_tag + '_run' + str(run_index)
                utils.export_list(run_keywords.keys(), 'res/conv_kw/kw_' + run_tag, False)
                utils.export_dico_num_csv(run_keywords, 'res/conv_tm/kw_' + run_tag, False)
def extract_relevant_cybergeo(kwLimit, database):
    """Extract relevant keywords for the cybergeo references and export them.

    Runs two queries against ``database`` (both restricted to rows of
    ``refdesc`` joined with ``cybergeo`` whose ``abstract_keywords`` is neither
    NULL nor empty), prints both results, passes them to
    ``kwFunctions.extract_relevant_keywords`` together with ``kwLimit``, and
    writes the two outputs under ``res/cybergeo``.

    Args:
        kwLimit: forwarded to ``extract_relevant_keywords`` and embedded in the
            output file names.
        database: forwarded unchanged to the ``utils`` query helpers.

    Returns nothing.
    """
    # NOTE(review): another definition with this same name appears later in
    # this file; being later, it is the one the name ends up bound to. The two
    # bodies look equivalent -- consider keeping only one.
    ids_query = "SELECT cybergeo.id FROM refdesc INNER JOIN cybergeo ON cybergeo.id=refdesc.id WHERE abstract_keywords IS NOT NULL AND abstract_keywords!='';"
    keywords_query = "SELECT cybergeo.id,abstract_keywords FROM refdesc INNER JOIN cybergeo ON cybergeo.id=refdesc.id WHERE abstract_keywords IS NOT NULL AND abstract_keywords!='';"

    corpus = utils.get_data(ids_query, database)
    print(corpus)

    occurence_dicos = utils.import_kw_dico_req(keywords_query, database)
    print(occurence_dicos)

    relevantkw, relevant_dico = kwFunctions.extract_relevant_keywords(
        corpus, kwLimit, occurence_dicos
    )

    limit_tag = str(kwLimit)
    utils.export_dico_csv(relevant_dico, "res/cybergeo/relevantDico_kwLimit" + limit_tag, False)
    utils.export_dico_num_csv(relevantkw, "res/cybergeo/kw_" + limit_tag, False)
def extract_relevant_cybergeo(kwLimit, database):
    """Compute and export the relevant keywords of the cybergeo corpus.

    Steps, in order:
      1. fetch the corpus with ``utils.get_data`` and print it;
      2. fetch the keyword dictionaries with ``utils.import_kw_dico_req`` and
         print them;
      3. call ``kwFunctions.extract_relevant_keywords(corpus, kwLimit, ...)``;
      4. export the two results to ``res/cybergeo/``, with ``kwLimit`` appended
         to each file name.

    Args:
        kwLimit: passed through to the extraction and used in output names.
        database: passed through to both ``utils`` query helpers.

    Returns nothing.
    """
    # NOTE(review): an earlier definition with this same name exists in this
    # file; this later one replaces it when the module is loaded. The bodies
    # appear equivalent -- consider removing the duplicate.
    corpus = utils.get_data(
        'SELECT cybergeo.id FROM refdesc INNER JOIN cybergeo ON cybergeo.id=refdesc.id WHERE abstract_keywords IS NOT NULL AND abstract_keywords!=\'\';',
        database)
    print(corpus)

    occurence_dicos = utils.import_kw_dico_req(
        'SELECT cybergeo.id,abstract_keywords FROM refdesc INNER JOIN cybergeo ON cybergeo.id=refdesc.id WHERE abstract_keywords IS NOT NULL AND abstract_keywords!=\'\';',
        database)
    print(occurence_dicos)

    extraction = kwFunctions.extract_relevant_keywords(corpus, kwLimit, occurence_dicos)
    # Exactly two values are expected back, as in the original unpacking.
    relevantkw, relevant_dico = extraction

    name_suffix = str(kwLimit)
    dico_path = 'res/cybergeo/relevantDico_kwLimit' + name_suffix
    kw_path = 'res/cybergeo/kw_' + name_suffix
    utils.export_dico_csv(relevant_dico, dico_path, False)
    utils.export_dico_num_csv(relevantkw, kw_path, False)