def get_metamatrix(self):
    """Build the meta-level feature matrix from classifier and LM scores.

    Steps, all of which mutate ``self``:

    1. ``self.metafeats`` is assembled column-wise from the two outputs of
       ``self.get_probas()`` (columns ``present``/``zero`` and
       ``a``/``an``/``the``) and the output of ``get_lm_probas`` (columns
       ``lm_a``/``lm_an``/``lm_the``/``lm_zero``).
    2. For every row, the LM probability of the original choice
       (``self.feats['Target']``) and of the suggested choice
       (``self.feats['Predicted']``) are looked up and stored as
       ``init_prob`` / ``corr_prob`` together with their ratio and
       difference.
    3. The option with the highest LM probability is written to
       ``self.feats['LM']``.
    4. ``self.metafeats`` is reduced to the rows where ``Target`` differs
       from ``Predicted`` or from ``LM``.
    5. ``self.metafeats_sparse`` is set to the horizontal stack of the
       numeric meta-features and the vectorized ``Target``, ``LM`` and
       ``Predicted`` values, and its shape is printed.

    Returns:
        None.
    """
    # Local import: only this method needs an explicit sparse constructor.
    from scipy.sparse import csr_matrix

    lm_cols = ['lm_a', 'lm_an', 'lm_the', 'lm_zero']

    preds, preds_type = self.get_probas()
    str_sents = self.get_sentences_for_lm()
    probs = get_lm_probas(str_sents)
    # temp for local windows - file is processed by lm on server separately
    #self.write_sentences_for_lm(str_sents)
    #with open('lm_preds_test_articles.json','r',encoding='utf-8') as f:
    #    probs = json.loads(f.read())
    # end temp
    self.metafeats = pd.concat(
        [
            pd.DataFrame(preds, columns=['present', 'zero']),
            pd.DataFrame(preds_type, columns=['a', 'an', 'the']),
            pd.DataFrame(probs, columns=lm_cols),
        ],
        axis=1,
    )

    probs_ratio = []
    probs_delta = []
    init_probs = []
    corr_probs = []
    lm_choice = []
    # Rows of self.metafeats and self.feats are paired by position.
    for i in range(self.metafeats.shape[0]):
        row = self.metafeats.iloc[i]
        feat_row = self.feats.iloc[i]
        init_prob = row['lm_' + feat_row['Target']]
        corr_prob = row['lm_' + feat_row['Predicted']]
        init_probs.append(init_prob)
        corr_probs.append(corr_prob)
        probs_ratio.append(init_prob / corr_prob)
        probs_delta.append(init_prob - corr_prob)
        # idxmax() returns the column *label* (e.g. 'lm_the'). np.argmax on
        # a Series returns an integer position on current pandas, which has
        # no .split() and would raise here.
        lm_choice.append(row[lm_cols].idxmax().split('_')[1])

    self.metafeats['init_prob'] = init_probs
    self.metafeats['corr_prob'] = corr_probs
    self.metafeats['probs_ratio'] = probs_ratio
    self.metafeats['probs_delta'] = probs_delta
    self.feats['LM'] = lm_choice
    #for sent,np,iprob,cprob in zip(self.feats['Sentence'],self.feats['raw_NP'],
    #                               self.metafeats['init_prob'],self.metafeats['corr_prob']):
    #    print(sent,np,iprob,cprob)

    # Keep only rows where the original choice disagrees with the
    # classifier's suggestion or with the language model's choice.
    disagrees = (
        (self.feats['Target'] != self.feats['Predicted'])
        | (self.feats['Target'] != self.feats['LM'])
    )
    self.metafeats = self.metafeats.loc[disagrees, :]

    # NOTE: pickle.load executes arbitrary code from the file; this path
    # must only ever point at a trusted, locally produced model file.
    with open('../models/article_choice_vectorizer.pickle', 'rb') as f:
        art_vect = pickle.load(f)

    kept = self.metafeats.index
    # DataFrame.to_sparse() no longer exists in pandas >= 1.0; build the
    # sparse block explicitly from the numeric values instead.
    self.metafeats_sparse = hstack((
        csr_matrix(self.metafeats.values),
        art_vect.transform(self.feats.loc[kept, 'Target']),
        art_vect.transform(self.feats.loc[kept, 'LM']),
        art_vect.transform(self.feats.loc[kept, 'Predicted']),
    ))
    print(self.metafeats_sparse.shape)
def lm_decision(sent, initial, suggestions, idx):
    """Pick the suggestion whose substituted sentence the LM scores highest.

    Each element of ``suggestions`` replaces the ``len(initial)`` characters
    of ``sent`` starting at ``idx``. The resulting sentences are joined with
    newlines (plus a trailing newline) and passed to ``get_lm_probas`` in
    ``'text'`` mode.

    Returns:
        The result of ``np.argmax`` over the values returned by
        ``get_lm_probas``.
    """
    candidates = [
        sent[:idx] + replacement + sent[idx + len(initial):]
        for replacement in suggestions
    ]
    lm_input = '\n'.join(candidates) + '\n'
    scores = get_lm_probas(lm_input, 'text')
    best = np.argmax(scores)
    return best
#if i != len(error_spans): # print(text) # print(i) # print(error_spans) # print(article_corrector.feats[['raw_NP','Start_idx','Sent_start_idx']]) # print('=================') corrector.feats = [] tn += 1 #with open('init_sents_for_'+err.lower()+'.txt','w',encoding='utf-8') as f: # f.write('\n==========\n\n'.join(init_sents)) #with open('tagged_sents_for_'+err.lower()+'.pickle','wb') as f: # pickle.dump(tagged_sents,f) lm_preds = get_lm_probas('\n\n'.join(['\n'.join(x) for x in all_sents]) + '\n', inp_type='text') with open(err.lower() + '_meta.csv', 'w', encoding='utf-8-sig', newline='') as f: csvw = csv.writer(f, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL) csvw.writerow(corrector.logit_bin.classes_.tolist() + corrector.logit_type.classes_.tolist() + [ 'raw_NP', 'Start_idx', 'Sent_start_idx', 'Initial', 'ML_L1', 'Ann' ] + ['lm_' + x for x in options]) for pred, predt, corr, lm_pred in zip(predsp, predst, correct, lm_preds): csvw.writerow(list(pred) + list(predt) + corr + lm_pred)