def run():
    """Train the Tdc100 and Tdc80 models on the same transformed gold data.

    The gold data is loaded and transformed once with ``ConfigBase``. A
    trainer is then initialised and trained for ``ConfigTdc100`` and,
    afterwards, for ``ConfigTdc80``; both receive the same category list
    and the same gold data container.
    """
    gold_data = main.load_gold_data(ConfigBase)
    gold_data = main.transform_gold_data(ConfigBase, gold_data)

    # Order matters: the 100 variant is fully trained before the 80 variant
    # is even initialised.
    for config in (ConfigTdc100, ConfigTdc80):
        trainer = main.init_trainer(config, cats_list=gold_data.cats_list)
        main.run_training(
            config=config,
            trainer=trainer,
            gold_data_container=gold_data,
        )
def run():
    """Merge six gold data sets into one container and train on the result.

    Data set 1 is transformed twice (``Config1_1`` then ``Config1_2``); data
    sets 2-6 are each loaded and transformed with their own config. All of
    them are merged, in order, into ``gdc``, which is then used to train a
    model configured by ``ConfigTrain``. Finally ``embed()`` is called.

    NOTE(review): ``embed`` is presumably IPython's interactive shell; the
    local names below are deliberately kept stable so they stay inspectable
    from that shell.
    """
    merge = gold_data_manager.merge_assuming_identical_categories

    # Data set 1 is the only one that passes through two transformations.
    gdc_1 = main.transform_gold_data(Config1_1, main.load_gold_data(Config1_1))
    gdc_1 = main.transform_gold_data(Config1_2, gdc_1)

    # The merge target is a fresh container created with data set 1's categories.
    gdc = GoldDataContainer(cats_list=gdc_1.cats_list)
    gdc = merge(gdc, gdc_1)

    gdc_2 = main.transform_gold_data(Config2, main.load_gold_data(Config2))
    gdc = merge(gdc, gdc_2)

    gdc_3 = main.transform_gold_data(Config3, main.load_gold_data(Config3))
    gdc = merge(gdc, gdc_3)

    gdc_4 = main.transform_gold_data(Config4, main.load_gold_data(Config4))
    gdc = merge(gdc, gdc_4)

    gdc_5 = main.transform_gold_data(Config5, main.load_gold_data(Config5))
    gdc = merge(gdc, gdc_5)

    gdc_6 = main.transform_gold_data(Config6, main.load_gold_data(Config6))
    gdc = merge(gdc, gdc_6)

    trainer = main.init_trainer(ConfigTrain, cats_list=gdc.cats_list)
    main.run_training(config=ConfigTrain, trainer=trainer, gold_data_container=gdc)

    embed()
def train(trainer1, trainer2):
    """Run one training pass for each of the two compared trainers.

    Args:
        trainer1: Trainer for ``ConfigTrainCompare1``, or ``None`` to have
            both trainers created here.
        trainer2: Trainer for ``ConfigTrainCompare2``. It is replaced
            whenever ``trainer1`` is ``None``.

    Returns:
        The ``(trainer1, trainer2)`` pair that was trained, so callers can
        pass it back in for a further pass.
    """
    gold_data = main.load_gold_data(ConfigTrainCompareBase)
    gold_data = main.transform_gold_data(ConfigTrainCompareBase, gold_data)

    if trainer1 is None:
        # No trainers yet: switch the shared base config to "create" mode
        # and build both trainers from scratch.
        # NOTE(review): only trainer1 is checked; a caller passing a real
        # trainer1 together with trainer2=None is not handled here.
        ConfigTrainCompareBase.should_load_model = False
        ConfigTrainCompareBase.should_create_model = True
        trainer1 = main.init_trainer(ConfigTrainCompare1, cats_list=gold_data.cats_list)
        trainer2 = main.init_trainer(ConfigTrainCompare2, cats_list=gold_data.cats_list)

    runs = (
        (ConfigTrainCompare1, trainer1),
        (ConfigTrainCompare2, trainer2),
    )
    for config, trainer in runs:
        main.run_training(config, trainer, gold_data)

    return trainer1, trainer2
def run():
    """Run the model indexer with a loaded ``Trainer4`` model.

    ``ConfigRoot`` is adjusted in place before the trainer is built: the
    trainer class is set to ``Trainer4``, model loading is switched on and
    the reference-articles table name is set to ``"main_ref_articles"``.
    """
    # Imported here rather than at module level, as in the original code.
    from trainer.trainer4 import Trainer4

    ConfigRoot.trainer_class = Trainer4
    ConfigRoot.should_load_model = True
    ConfigRoot.table_name_ref_articles = "main_ref_articles"

    indexing_trainer = main.init_trainer(ConfigIndex)
    main.run_model_indexer(ConfigIndex, indexing_trainer)
def run():
    """Train the ``ConfigSub`` model for 30 consecutive rounds.

    The gold data is loaded and transformed once. In the first round the
    model is created from scratch; in every later round the config is
    switched to loading the model instead. A new trainer is initialised in
    each round.
    """
    gold_data = main.load_gold_data(ConfigSub)
    gold_data = main.transform_gold_data(ConfigSub, gold_data)

    for round_index in range(30):
        is_first_round = round_index == 0

        # Create the model exactly once, then keep loading it.
        ConfigSub.should_load_model = not is_first_round
        ConfigSub.should_create_model = is_first_round

        trainer = main.init_trainer(config=ConfigSub, cats_list=gold_data.cats_list)
        main.run_training(ConfigSub, trainer, gold_data)
def run():
    """Evaluate the ``ConfigSub`` model, then call ``embed()``.

    NOTE(review): ``embed`` is presumably IPython's interactive shell; the
    ``trainer`` local keeps its name so it can be inspected from there.
    """
    trainer = main.init_trainer(ConfigSub)

    main.run_evaluation(ConfigSub, trainer)

    # Last step on purpose: everything above has finished by the time this runs.
    embed()
def run():
    """Run a single training pass for the ``ConfigSub`` model.

    Loads the gold data, applies the ``ConfigSub`` transformation, builds a
    trainer for the resulting category list and trains it on that data.
    """
    raw_gold_data = main.load_gold_data(ConfigSub)
    gold_data = main.transform_gold_data(ConfigSub, raw_gold_data)

    trainer = main.init_trainer(config=ConfigSub, cats_list=gold_data.cats_list)
    main.run_training(ConfigSub, trainer, gold_data)
def run():
    """Evaluate the VR model on three views of the g8 gold data.

    1. The whole transformed data set.
    2. Only the items for which the AF model scores one of the two AF
       categories above 0.5.
    3. Only the items annotated with one of the two AF categories (value 1
       in the untransformed data), transformed afterwards.

    Each evaluation is announced through ``main.log_manager.info_global``
    and finally ``embed()`` is called.

    NOTE(review): ``embed`` is presumably IPython's interactive shell; the
    model, container and score names are kept stable so the results remain
    inspectable from there.
    """
    af_cats = ('AF: Social Companions', 'AF: Soziale Medien')

    def container_with(source, items):
        """Return a new container with ``source``'s categories and ``items``."""
        subset = GoldDataContainer()
        subset.cats_list = source.cats_list
        subset.gold_data_item_list = items
        return subset

    # --- 1) whole data set -------------------------------------------------
    eval_data_container = main.load_gold_data(ConfigLoadG8)
    eval_data_container = main.transform_gold_data(ConfigLoadG8, eval_data_container)

    modelVR = main.init_trainer(ConfigLoadVRModel)

    main.log_manager.info_global(
        "--------------------------------\n"
        "Evaluating mo11 over the entire dataset g8: \n"
    )
    scores_spacy, scores_manual = modelVR.evaluate(eval_data_container)

    # --- 2) items the AF model (mo9) predicts as AF=SM or AF=SC --------------
    modelAF = main.init_trainer(ConfigLoadAFModel)

    gdis_to_keep = []
    for gdi in eval_data_container.gold_data_item_list:
        predicted_cats = modelAF.nlp(gdi.text).cats
        # any() stops at the first category above the threshold.
        if any(predicted_cats[cat] > 0.5 for cat in af_cats):
            gdis_to_keep.append(gdi)

    eval_data_container2 = container_with(eval_data_container, gdis_to_keep)

    main.log_manager.info_global(
        "--------------------------------\n"
        "Evaluating mo11 over those texts in g8 that mo9 predicts to be AF=SM or AF=SC: \n"
    )
    scores_spacy2, scores_manual2 = modelVR.evaluate(eval_data_container2)

    # --- 3) items annotated as AF=SM or AF=SC --------------------------------
    # The data is reloaded to undo the transformation that removes AF.
    eval_data_container = main.load_gold_data(ConfigLoadG8)

    gdis_to_keep = [
        gdi
        for gdi in eval_data_container.gold_data_item_list
        if any(gdi.cats[cat] == 1 for cat in af_cats)
    ]

    eval_data_container3 = container_with(eval_data_container, gdis_to_keep)

    # Only now apply the transformation that removes all categories except VR.
    eval_data_container3 = main.transform_gold_data(ConfigLoadG8, eval_data_container3)

    main.log_manager.info_global(
        "--------------------------------\n"
        "Evaluating mo11 over those texts in g8 that were annotated as AF=SM or AF=SC: \n"
    )
    scores_spacy3, scores_manual3 = modelVR.evaluate(eval_data_container3)

    embed()
def run():
    """Build the trainer described by ``ConfigIndex`` and run the model indexer with it."""
    indexing_trainer = main.init_trainer(ConfigIndex)

    main.run_model_indexer(ConfigIndex, indexing_trainer)
def run():
    """Score three fixed groups of documents with the RT model and export CSVs.

    For every group the text of each listed docid is fetched through
    ``ske_manager.get_doc_from_docid``, run through ``modelRT.nlp`` and the
    resulting ``doc.cats`` scores are collected. Two files are written per
    group:

    * the "detailed" CSV has a header row (``docid`` plus every textcat
      label) and one row of scores per document;
    * the "coarse" CSV has no header; each row is the docid followed by the
      categories whose score is above 0.5.

    The SKE session opened at the start is closed even when fetching,
    predicting or writing fails.
    """
    modelRT = main.init_trainer(ConfigLoadRTModel)

    ske_config = {
        "ske_rest_url": credentials.ske_rest_url,
        "ske_corpus_id": credentials.ske_corpus_id,
        "ske_user": credentials.ske_user,
        "ske_password": credentials.ske_password,
    }

    groups = [
        {
            "output_detailed_filename": "Risikotyp_detailed_SC.csv",
            "output_coarse_filename": "Risikotyp_coarse_SC.csv",
            "docids": [
                "PRESSE_201701291915120115", "STANDARD_200007081907270238",
                "PRESSE_2018111635E6C549D7", "TTKOMP_20160601033409214140080",
                "VN_20160813234127320770111", "WZ_20181003_5098459",
                "WZ_20181003031108870400088", "DIEWIR_20170904A65AE63CC9",
                "WZ_20160722031108098120017", "PROFIL_201604041421430008",
                "PRESSE_201603251915080063", "PRESSE_201603250400150004",
                "STANDARD_20130522194002017620108", "EMEDIA_201711244E597405B3",
                "KURIER_201805011920500018", "PRESSE_201802101915160118",
                "SN_20150912110031136500056", "SN_20160729013124288040079",
                "KURIER_201806081920310032", "STANDARD_20150909194002257560093",
                "WZ_20170610031246099190029", "FURCHE_2017112314B891ACDA",
                "PRESSE_201306301915390060", "OOEN_20161018040003118910025",
                "PRESSE_201310131915430079", "STANDARD_20170217194004061000060",
                "SN_20160201013102166390056", "NVT_20170827543CFC3659",
                "KLEINE_20170827041003215460321", "PRESSE_200906071915540116",
                "WZ_20121227031131015130021", "KLEINE_20170825041003194510320",
                "STANDARD_20170125194005197690112", "FURCHE_20180509F6639BE236",
                "MEDIANET_2016030413C1A5E1EC", "NVT_20170825FD0C32D6C8",
                "STANDARD_20160105194002051940104", "SN_20150710013128181990083",
                "PRESSE_201611271915120046", "VN_20121215232708007760147",
                "STANDARD_20171115194502812820041", "STANDARD_201008111925100054",
                "STANDARD_20150828194002169290150", "WZ_20180609031309193220091",
                "PRESSE_201704291915140121", "STANDARD_20171108194501866020115",
                "PROFIL_20170410B94762E951", "KRONE_20171014219271554",
                "WZ_20180217031310334780077", "PRESSE_201801141915130063",
                "EMEDIA_20180126F0E083B9A7", "WZ_20151114031310271970073",
                "KRONE_20170507107200378", "SN_20130619232208096190108",
                "EMEDIA_201302221753240056", "TREND_201503230956040007",
                "KURIER_201609080401410126", "FALTER_201703224B7E7AC4F1",
                "STANDARD_20180321194502095040050", "WIENERIN_2018072680706B2C87",
                "MWVOLL_20170914_8820361", "WZ_20160416031109313060093",
                "STANDARD_20161013194004252090046", "WZ_20180525C9E368BFD8",
                "WZ_200107201913270001", "KURIER_201609091920420072",
                "SN_20170914013122005340117", "FURCHE_201205311643170039",
                "KURIER_201707061920410065", "PRESSE_201809291915120178",
                "KURIER_201612211920400070", "STANDARD_20160326194003226570109",
                "KURIER_201511051920580101", "FORMATDB_20161111BC7EA0D6BB",
                "OOEN_20120522040006076900046", "WZ_20140124031310182590078",
                "KURIER_201708261920360109", "TT_201605016250DBF067",
                "WIBLATT_201608225EEF91EB48", "KURIER_200307111621030038",
                "KRONE_20170929508340752", "WZ_20150121031316298470064",
                "WZ_20171104031109152100017", "KLEINE_201710284F3466E472",
                "SN_20170629013050252890117", "KLEINE_20171125041005034520361",
                "MEDIANET_201404259B6F8F2B7E", "EMEDIA_20181130F6382BD619",
                "OESTERREICH_20180926975ED551E3", "MWVOLL_20030917190000603100",
                "KURIER_201808281920480044", "STANDARD_200107071907170162",
                "PRESSE_200212280218170053", "KURIER_1996012218530147",
                "PRESSE_201603131915150054", "KURIER_201801280401260080",
                "PRESSE_201704121915090007", "FORMATDB_201611110F8589863A",
                "PRESSE_201306111915210063", "KURIER_201712080401330081",
                "OOEN_20160305040002039760040", "STANDARD_20181027194502239080114",
                "DIEWIR_201109051311500026", "WZ_20180214031308504950066",
                "DATUM_201202011605500020", "OOEN_20160305B035521226",
                "STANDARD_20161109194007308140095", "STANDARD_20160504194003235260116",
                "HOR_20181116CB6F841543", "KRONE_20181023142940120",
                "KURIER_201209151717240207", "FORMATDB_20150925A7BD99FFF2",
                "PRESSE_200704061849390083", "KLEINE_20150501041003227740229",
                "TTKOMP_20160719020245298030064", "KURIER_201405311831490009",
                "KURIER_201801030401370268", "SN_20170210013038072830002",
                "KURIER_201803081920380154", "KURIER_201709210401550411",
                "SN_201003132325000157", "MWVOLL_20180628_9981783",
                "KLEINE_20170614041003015480051", "PRESSE_200812191915420113",
                "PROFIL_199712200615043", "PRESSE_201210281916120151",
                "EMEDIA_201812144CB9AE62CE", "SN_201103192323490015",
                "WZ_20160820031110268600067", "KURIER_201701290401280053",
                "WIENERIN_20161124C7FD632CA1", "PRESSE_201605041915110076",
                "OOEN_20170117040003077910020", "STANDARD_20170722194501271440089",
                "SN_199707192023320035", "MWVOLL_20170112_7821144",
                "KURIER_201705121920390061", "TT_20180123C9871AE503",
                "PROFIL_201812178E7D0688AB", "WZ_20181229_5148737",
                "KRONE_199904091757078", "TT_20170825F693694D0D",
                "FURCHE_20160728E0B6BB12ED", "KLEINE_20161028041003014480122",
                "NVT_200108010500550020", "MWVOLL_20160616_6998555",
                "KLEINE_20171027041003028330126", "PRESSE_201506071915360071",
                "KRONE_20181007810780644", "TT_201602032192BBDFF8",
                "SN_200005202042540315", "PRESSE_201407161915270070",
                "SN_201001072322140083", "KURIER_201709210401550151",
                "NVT_201704192901910B90", "KURIER_201801030401370141",
                "KURIER_201212131716140053", "KURIER_201010231720010174",
                "OOEN_20170216040003180050041", "WZ_20151104031308233360064",
                "KLEINE_20181023041005295000439", "KURIER_200812051719380050",
                "FORMATDB_200408061527100123", "VN_201802240AED5AC3D8",
                "SN_20181020013238294520065", "TT_20130518020702184280",
                "SN_20150928013141286540065", "KURIER_201212130257460028",
                "NOEN_20180523015522145170020", "KURIER_201511041920580028",
                "TT_201704072E97932212", "VN_20181009231332702480107",
                "PRESSE_201708261915150104", "KURIER_201712080401330011",
                "STANDARD_20160105194002051940118", "STANDARD_200503261922440151",
                "KURIER_201811210401140268", "PROFIL_201204161209330087",
                "PROFIL_201405121146010070", "STANDARD_20180411194503066820116",
                "VN_20160326230734283850135", "OOEN_20180414014503176230089",
                "STANDARD_20160105194002051940116", "STANDARD_20180517194502325560049",
                "IM_201606291AC7F29F5E", "TT_20160912860AAB4E91",
                "PRESSE_201605291915150097", "STANDARD_20181027194502239080115",
                "SN_200604082314150216", "KURIER_201709210401550037",
                "OOEN_20181107014504186900074", "PRESSE_201208051917230086",
                "PRESSE_201410281915260106", "KURIER_201706210401190224",
                "KURIER_201302090309270097", "KURIER_201302091832000119",
                "KURIER_201606211920420083", "EMEDIA_20170224FEC67206A4",
                "KURIER_201805280400460030", "TT_20170825BACDD2A41E",
                "STANDARD_200611181922050155", "PRESSE_201607301915130110",
                "WZ_20171201031310660980028", "STANDARD_20150218194001313380101",
                "STANDARD_19911221050632", "WIENERIN_20170126222697D75E",
                "BVZ_20140701143421024730097", "WIBLATT_200804170205030184",
                "IM_20010701_1601310082", "IM_201009291642160043",
                "TTKOMP_20160701020515166300102", "PRESSE_201507121915360104",
                "STANDARD_19920205054800", "WZ_200408052235210142",
                "WZ_20180223031310791180041", "TT_201801186199D4726A",
                "OOEN_20171030040004016540027", "WZ_20181201_5133365",
                "PROFIL_201307221531470023", "KRONE_20160603011960099",
                "PRESSE_201711111915180109", "KURIER_201001031728370081",
                "WZ_20171108031310271600038", "NEWS_200003092057410012",
                "PRESSE_1999020603430016", "NEWS_2016040298A7674F75",
                "KURIER_201702171920390031", "VN_20110514D1BD2E4A00",
                "KURIER_200804101725380136", "VN_20150801232923258770102",
                "PRESSE_201306301915390059", "KRONE_20160911104750263",
                "STANDARD_20150418194005147330050", "STANDARD_20171115194502812820138",
                "OOEN_200703030408250119", "MWVOLL_20181113_10517373",
                "SN_20160504013130298600094", "KLEINE_20170406041003109020296",
                "WZ_20170908031307303630059", "WZ_20101023011627071210051",
                "VN_20150509231014309580135", "FORMATDB_201401101616150072",
                "KRONE_201412200435490360", "KRONE_201412191716410139",
                "TTKOMP_20160226020411097190106", "KLEINE_20150805041003069370055",
                "KRONE_20160403211040257", "KURIER_201212131716140054",
                "OOEN_200703030408250066", "PROFIL_201404071031410011",
                "WIBLATT_200305281924120064", "WZ_200810170115320154",
                "VN_20160718233308157760116", "STANDARD_19960119054848000020",
                "WZ_20160331031308108280086", "KURIER_201601121920550012",
                "OOEN_200809250400290135", "WIENERIN_201712149207825157",
                "NOEN_20180117074232329640218", "NOEN_20180116080553666750081",
                "PRESSE_20160803B1018081CD", "STANDARD_200408021921180127",
                "PROFIL_201807092787C83B57", "SN_20170905013041082640031",
                "SN_20140402005000266560093", "FORMATDB_201509252E0F8AB8B4",
                "KURIER_201707061920410025", "VN_20130518230427033460115",
                "STANDARD_20161221194004221610086", "ECHO_200709011522210012",
            ],
        },
        {
            "output_detailed_filename": "Risikotyp_detailed_SM.csv",
            "output_coarse_filename": "Risikotyp_coarse_SM.csv",
            "docids": [
                "HOR_2018082490C6B90BA6", "KURIER_201802201920530123",
                "TT_201804180CD23811B4", "KURIER_201611201920430069",
                "TREND_201612226E11013D1F", "WZ_20161029031111143530003",
                "STANDARD_20170523194005160300053", "WZ_20151218031309229620086",
                "TT_201708119645A5DB57", "PRESSE_201610011915210114",
                "OOEN_20180707014504095450024", "NEWS_201803302D93C6840A",
                "KURIER_201611161920420006", "HOR_201802026B29B8014D",
                "PROFIL_20160418C3E30FAB52", "STANDARD_20180221194502454650053",
                "VN_20161119234212178480026", "NEWS_2016121770EAEEF26F",
                "MEDIANET_201801191CD52EA209", "EMEDIA_20170630F066D347A6",
                "WZ_20161124031311072500017", "WZ_20160701031110004350080",
                "KURIER_201706101920400064", "PRESSE_201206011916560077",
                "WZ_20171230031309606310024", "FALTER_2017032239A20C3CF6",
                "STANDARD_20161021194006012290100", "WZ_20150617031308046800089",
                "STANDARD_20171009194501248060058", "STANDARD_20140930194001232200080",
                "WZ_20161029031111143530065", "KURIER_201709161920370055",
                "HOR_20180420D5FA94E99C", "WZ_20180116031310640080065",
                "PROFIL_201504271515360021", "FALTER_201804043CD17074DB",
                "WZ_20140214031311088700075", "WZ_20160713031109310430069",
                "STANDARD_20161126194005245720106", "WZ_20161228031110098750062",
                "KONSUM_2018122075F5776A7B", "STANDARD_20121217194003077710071",
                "FALTER_20161027BE722103D4", "NVT_20161227316B6D24F7",
                "STANDARD_20180612194505516790071", "FURCHE_201709219910618CEC",
                "NEWS_20161029A3577B7D95", "TT_201704244B424FE8D5",
                "STANDARD_20180523040502553940131", "TVMEDIA_20161130A193B0E0D0",
                "KLEINE_20161227041002255230055", "KRONE_20180322204680105",
                "HOR_20180518B60B469237", "STANDARD_20170323194003312040055",
                "STANDARD_20160507194005312160081", "WZ_20160102031110045780077",
                "WZ_20111110031312018380097", "PRESSE_201801291915100054",
                "HOR_20170929ED5BF5B1F9", "WZ_20170816031306156110056",
                "STANDARD_20130827194001212180078", "MWVOLL_20161203_7668395",
                "EMEDIA_20170825E2EFCA16F6", "MWVOLL_20131007_3368714",
                "SN_20180505013249159810031", "WZ_20141219031315186660080",
                "KURIER_201805031920330140", "PRESSE_201805281915090038",
                "FORMATDB_20161014B62E777BAC", "TT_201803290E1143FE33",
                "WZ_20130731031312148090058", "HOR_201709298002CF28C0",
                "KURIER_201609231920410017", "KURIER_201610132035450044",
                "FALTER_201607068366BB6A49", "WZ_20160520031309097640078",
                "FALTER_20160706C6654FC375", "MWVOLL_20180621_9948695",
                "KRONE_20180113665350016", "PRESSE_201801131915170068",
                "WZ_20140927031310082540015", "TT_20161231E97D18764E",
                "MEDIANET_20160902B0C588AC2A", "AUGUSTIN_20180523BD35D9AADB",
                "KURIER_201611161920420050", "HOR_201806083E1BC69BF7",
                "FALTER_201210171627490040", "WZ_20160430031309229040097",
                "KURIER_201611211920390062", "KURIER_201612291920400008",
                "KURIER_201611160401360151", "KURIER_201705131920390057",
                "TT_2018050580FC7DEEE6", "KURIER_201708011920340147",
                "OOEN_20161124040003143720095", "KURIER_201708021920320126",
                "FALTER_20170301591854D1A7", "FORMATDB_201404251509320066",
                "TT_200605110130510059", "KONSUM_201811226DB535AF40",
                "WZ_20120811031310280150021", "OOEN_20180125040004262250071",
                "STANDARD_20180407194503450510186", "HOR_201604291219400028",
                "SBGW_20180329030104250810004", "KURIER_201612211920400031",
                "PRESSE_201209131916470087", "TREND_201410271119240033",
                "KURIER_201701051920400069", "PRESSE_201812191915090090",
                "KURIER_201701250401240048", "STANDARD_20160810194003101160073",
                "FALTER_201607066AE4DE74BC", "STANDARD_20180420194502107340150",
                "OOEN_20180623014504177420057", "SN_20180113013228261630040",
                "MWVOLL_20151010_5965051", "WZ_20180621031308464340089",
                "HOR_201805116BB5AD4C0E", "WZ_20160406031108092660081",
                "EMEDIA_2018121434088C1B16", "PRESSE_201708250400130056",
                "WZ_20170729031307212620093", "KURIER_201704041920380060",
                "EMEDIA_20181025A5F12F21B0", "SN_20180726013047117830037",
                "PRESSE_201811091915090081", "MEDIANET_20160520329B003D38",
                "WZ_20171230031309606310089", "SN_20150625013134095230069",
                "FALTER_20161221BEEFEA679E", "STANDARD_20160416194005259970135",
                "WZ_20140327031316211250086", "KURIER_201605090401000029",
                "PRESSE_201811031915130184", "TREND_201309301540590016",
                "SN_20170619013047264520053", "WZ_20180303031309561710033",
                "KURIER_201612171920420009", "OOEN_20180502014504039450045",
                "OOEN_20171130040004177310091", "MEDIANET_20150626D24AEE7D06",
                "WZ_20180421226A5B8EF8", "STANDARD_20140205194001180700014",
                "KURIER_201210090311310024", "OOEN_20150311035959197400062",
                "EMEDIA_2017063098D7BFB26C", "PROFIL_201703131303200008",
                "KURIER_201709260401220003", "WZ_20160617031109050310088",
                "WZ_20170314031110164530078", "KURIER_201807101920500128",
                "PRESSE_201706191915100015", "STANDARD_20181020194503555290113",
                "SN_20121114232153184730065", "STANDARD_20180213194502596740055",
                "MEDIANET_201401144B991FAE58", "WZ_20160827031109196290092",
                "KLEINE_20170619041003151010191", "WZ_20171018031307015270083",
                "KLEINE_20180428041005096000409", "WZ_20161116031310289520007",
                "WZ_20180608031308098960016", "WZ_20121113031323233630015",
                "KURIER_201706231920420148", "STANDARD_20180217194503792870144",
                "HOR_2017092225EA50DB01", "STANDARD_20180306194502455760039",
                "WIENERIN_2017112311A04DB8BA", "KURIER_201706141920400055",
                "PROFIL_20160613509111033C", "VN_20170619231851153070030",
                "KURIER_201806261921130145", "KURIER_201712061920340136",
                "WZ_20140816031315025180060", "PRESSE_201601091915170066",
                "STANDARD_20180228194502100220091", "TREND_201707288F5D6CABCD",
                "SN_20180609013300254480032", "WZ_20160930031310160560038",
                "TT_20180429EA1BDEB143", "MWVOLL_20181112_10516521",
                "ARBEITW_2015091667B9268003", "PRESSE_201703121915120154",
                "STANDARD_20180505194503285970092", "WZ_20131029031314195500091",
                "WZ_20170223031110261430020", "HOR_201505221355140028",
                "MWVOLL_20180527_9850944", "MWVOLL_20161216_7720033",
                "WZ_20111115031309007650064", "OOEN_20171019040003108430459",
                "STANDARD_20181121194502895250091", "PROFIL_201507279F8A2214BC",
                "WZ_20120204031314062310127", "KURIER_201706220401530297",
                "KURIER_201606220402160236", "SN_20160921013100182820086",
                "HOR_20171027BD8378B6BF", "STANDARD_20171122194502350460038",
                "FALTER_2015120262C41042D2", "KLEINE_20180603041006090110503",
                "KLEINE_20171004041005036270360", "WZ_20150224031309054110092",
                "PRESSE_201509191915460085", "HOR_201804063E31662A39",
                "VN_20170703232342174230113", "FORMATDB_201705123B2C075201",
                "KRONE_20180112749000008", "WZ_20150617031308046800070",
                "HOR_2018051818434D63C5", "SN_20180516013053016430080",
                "WZ_20150522031314256230013", "STANDARD_20130614194003266710096",
                "MWVOLL_20151221_6271834", "VN_20180324000014095820021",
                "OOEN_20171129040004132910109", "STANDARD_20160921194004165110114",
                "STANDARD_201112151925160084", "HOR_201505291614310008",
                "SN_20180705013104300970047", "PROFIL_20180806F18A6B1C82",
                "WIBLATT_201205218B11B07398", "NVT_20130825348874E2F3",
                "HOR_20180112C7D5EFDB5D", "STANDARD_20141112040502235530058",
                "TT_201604293A9C709A55", "STANDARD_201109291925100060",
                "HOR_201210121525480015", "PRESSE_201405041915370159",
                "MWVOLL_20130610_2958740", "PRESSE_201608061915130103",
                "STANDARD_20130713194001270370094", "HOR_20161125EDE694F8A7",
                "MEDIANET_2018051854EE2019AB", "NVT_201801314C009E2719",
                "WZ_20180901_5080836", "EMEDIA_20141128E23D140AB9",
                "KLEINE_20180131041006324230241", "KURIER_201807191920330152",
                "PRESSE_201806090050160116", "MEDIANET_20180427EE75BF82BD",
                "NVT_201712060F00F9454E", "KLEINE_20171206041004267390382",
                "KURIER_201806021920310102", "OESTERREICH_201705281039445318",
                "WZ_20111223031310295860064", "TT_20181208BE359ADF61",
                "STANDARD_200708291921100100", "STANDARD_20171102194502538810063",
                "KURIER_201803130401050042", "KURIER_201604131921040170",
                "KURIER_201803131920520035", "KLEINE_20151231041001048910051",
                "WZ_20141127031310104160016", "KRONE_20161030176580104",
                "KURIER_201306301834350076", "KURIER_201701250401240118",
                "KURIER_201611011921000073", "STANDARD_20160924194006198620098",
                "SN_20140908003124006640046", "OEREICHE_20150917CDE3FEEF88",
                "SN_20140811003124166830012", "KURIER_201709150402190191",
                "STANDARD_20171012040504673550028", "HOR_201709222740CBF99B",
                "HOR_20160415965951E8CD", "HOR_201111111511230025",
                "FALTER_201008111611360076", "FORMATDB_2018071356A3AACC31",
            ],
        },
        {
            "output_detailed_filename": "Risikotyp_detailed_SM_SC.csv",
            "output_coarse_filename": "Risikotyp_coarse_SM_SC.csv",
            "docids": [
                "NVT_20171014318738E118", "KLEINE_20170810041002041130292",
                "SN_20160608013118289810045", "STANDARD_20160419194003243230079",
                "KURIER_201701121920380076", "TTKOMP_20161221020458156710232",
                "KRONE_20161030176580107", "OOEN_20171216040003158190062",
                "TT_201805120C8741EEB7",
            ],
        },
    ]

    ske_manager.create_session(ske_config)
    # try/finally so the session is closed even if a fetch, a prediction or
    # a file write raises part-way through.
    try:
        labels = list(modelRT.nlp.get_pipe("textcat").labels)
        main.log_manager.info_global(
            "--------------------------------\n"
            f"Labels: {labels}\n"
        )
        header = ["docid"] + labels

        for group in groups:
            main.log_manager.info_global(
                "--------------------------------\n"
                "Starting new group\n"
            )

            progressbar = progress.bar.Bar(
                'Calculating predictions for each text ...',
                max=len(group['docids']),
                suffix='%(index)d/%(max)d done, ETA: %(eta_td)s h',
            )

            group['predictions'] = []
            for docid in group['docids']:
                text = ske_manager.get_doc_from_docid(ske_config, docid)['text']
                doc = modelRT.nlp(text)
                # One row per document: the docid plus every category score.
                group['predictions'].append({"docid": docid, **doc.cats})
                progressbar.next()
            progressbar.finish()

            main.log_manager.info_global(
                "--------------------------------\n"
                "Output CSVs\n"
            )

            # newline='' is what the csv module expects of its file objects;
            # without it, platforms that translate "\n" get an extra "\r".
            with open(group["output_detailed_filename"], 'w', newline='') as f:
                writer = csv.DictWriter(f, fieldnames=header)
                writer.writeheader()
                writer.writerows(group['predictions'])

            with open(group['output_coarse_filename'], 'w', newline='') as f:
                writer = csv.writer(f, dialect="excel")
                # Coarse view: docid followed by the categories scoring above 0.5.
                writer.writerows(
                    [pred['docid']]
                    + [cat for cat in pred if cat != "docid" and pred[cat] > 0.5]
                    for pred in group['predictions']
                )
    finally:
        ske_manager.close_session()