def closest_n_texts(residual_texts, n_c=10):
    """Return the n_c most representative texts from residual_texts.

    The texts are normalized, vectorized with TF-IDF, and the indices of
    the representative texts are chosen by closest_n_index; the matching
    original (un-normalized) texts are returned.

    Parameters
    ----------
    residual_texts : list of texts accepted by textNormalization.normalize_texts
    n_c : int, number of representative texts to return (default 10)
    """
    txt_nrm = textNormalization()
    normal_texts = [' '.join(tokens)
                    for tokens in txt_nrm.normalize_texts(residual_texts)]
    vect = TfidfVectorizer(min_df=5)
    tfidf = vect.fit_transform(normal_texts)
    # .toarray() instead of the .A shorthand: .A is deprecated/removed on
    # newer scipy sparse containers.
    X = tfidf.toarray()
    closest = closest_n_index(X, n_c=n_c)
    # Map the selected indices back to the original texts.
    return [residual_texts[i] for i in closest]
def texts_to_mean_hist(self, texts, method):
    """Return the mean word histogram over a collection of texts.

    Each text is turned into a word-count histogram via
    textNormalization.text_to_hist; the counts are summed across all
    texts and divided by the number of texts.

    Parameters
    ----------
    texts : list of texts accepted by textNormalization.text_to_hist
    method : unused here; kept for interface compatibility with the
        sibling comparison methods. NOTE(review): confirm callers.

    Returns
    -------
    dict mapping word -> mean count per text.
    """
    # Guard: an empty input would otherwise raise ZeroDivisionError below.
    if not texts:
        return {}
    total_hist = Counter()
    txt_nrm = textNormalization()
    for text in texts:
        total_hist += txt_nrm.text_to_hist(text)
    n_texts = len(texts)
    return {word: count / n_texts for word, count in total_hist.items()}
def compare_one_all(self, textA, texts, method):
    """Score one (id, text) pair against every (id, text) pair in texts.

    For each candidate text, collects the histogram counts of the words
    it shares with textA into two parallel lists and applies `method` to
    them; returns a dict mapping each candidate id to its score.
    """
    normalizer = textNormalization()
    ref_id, ref_text = textA
    ref_hist = normalizer.text_to_hist(ref_text)
    scores = {}
    for other_id, other_text in texts:
        other_hist = normalizer.text_to_hist(other_text)
        ref_values = []
        other_values = []
        # Only words present in BOTH histograms contribute to the score.
        for word, count in ref_hist.items():
            if word in other_hist:
                ref_values.append(count)
                other_values.append(other_hist[word])
        scores[other_id] = method(ref_values, other_values)
    return scores
def __init__(self, list_texts_control, treshold_distance=None):
    """Fit a TF-IDF model on the control texts and set a distance threshold.

    Parameters
    ----------
    list_texts_control : list of texts accepted by
        textNormalization.normalize_texts; kept on the instance.
    treshold_distance : float or None. When None, the threshold defaults
        to the mean pairwise cosine distance between the control vectors.
        (Name keeps the original spelling for caller compatibility.)
    """
    self.list_texts_control = list_texts_control
    self.vectorizer = TfidfVectorizer()
    self.txtN = textNormalization()
    self.normal_texts = [
        ' '.join(tokens)
        for tokens in self.txtN.normalize_texts(self.list_texts_control)
    ]
    self.vect_fit = self.vectorizer.fit(self.normal_texts)
    # .toarray() instead of the deprecated .A sparse-matrix shorthand.
    self.X_control = self.vect_fit.transform(self.normal_texts).toarray()
    # `is not None` so an explicit threshold of 0.0 is honored instead of
    # being silently replaced by the computed mean distance.
    if treshold_distance is not None:
        self.treshold_distance = treshold_distance
    else:
        self.treshold_distance = np.mean(
            distance.cdist(self.X_control, self.X_control, 'cosine'))
def __init__(self, df_path, use_pca=True, N=1000, min_df=0.1, max_df=0.5,
             df_column_text='text', df_class_text='class', create_y=True):
    """Load the CSV at df_path and build the feature representation.

    Reads the dataframe, prepares the TF-IDF vectorizer and normalizer,
    builds X (and optionally y) via create_X_y, and — when use_pca is
    True — projects the features onto N PCA components.
    """
    # Independent configuration flags first.
    self.create_y = create_y
    self.use_pca = use_pca
    # Data, vectorizer and normalizer needed by create_X_y below.
    self.df = pd.read_csv(df_path)
    self.vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df)
    self.txtN = textNormalization()
    self.create_X_y(df_column_text, df_class_text)
    if use_pca:
        self.create_X_PCA(N=N)
def __init__(self, classifier_method, use_pca=True, N=10, max_df=0.9,
             df_path='texts_manually_classified.csv', X_and_y=None):
    """Prepare features and fit a classifier.

    Parameters
    ----------
    classifier_method : passed through to create_clf.
    use_pca : project features onto N PCA components (only applies when
        X and y are built from the CSV).
    N : number of PCA components.
    max_df : TfidfVectorizer max_df.
    df_path : CSV with the manually classified texts.
    X_and_y : optional (X, y) pair; when given, it is used directly and
        the CSV-derived features are skipped.
    """
    # NOTE(review): the CSV and vectorizer are loaded even when X_and_y is
    # supplied; kept as-is since other methods may rely on these attributes.
    self.df = pd.read_csv(df_path)
    self.vectorizer = TfidfVectorizer(max_df=max_df)
    self.txtN = textNormalization()
    # `is None` rather than truthiness, so a caller-supplied (X, y) is
    # never mistaken for "absent".
    if X_and_y is None:
        self.create_X_y()
        self.use_pca = use_pca
        if use_pca:
            self.create_X_PCA(N)
    else:
        # NOTE(review): self.use_pca is not set on this branch (mirrors
        # the original); confirm no downstream code reads it here.
        self.X, self.y = X_and_y
    self.create_clf(classifier_method)
from textNormalization import textNormalization
import os
import sys

# Bug fix: `os` was used below but never imported (NameError at startup).
sys.path.append(os.path.dirname(os.getcwd()))
from crawlers.common.conexao_local import cursorConexao

if __name__ == '__main__':
    # Usage: script.py <source_table> <destination_table>
    tabela_inicial = sys.argv[1]
    tabela_final = sys.argv[2]
    cursor = cursorConexao()
    tn = textNormalization()
    # Table names cannot be bound as DB-API parameters, so they remain
    # interpolated; they come from the command line, not end users.
    cursor.execute('SELECT id, texto from %s where texto is not null;'
                   % (tabela_inicial,))
    dados = cursor.fetchall()
    decisoes = tn.dicionario_invertido_id_texto(dados)
    for k, v in decisoes.items():
        # Bind the values through the driver instead of quoting them into
        # the SQL string: words containing quotes previously broke (or
        # injected into) the INSERT. Assumes a %s-paramstyle driver such
        # as MySQLdb — TODO confirm against conexao_local.
        cursor.execute(
            'INSERT INTO %s (palavra,id_texto) values (%%s,%%s);'
            % (tabela_final,),
            (k, v),
        )