def closest_n_texts(residual_texts, n_c=10):
    """Return the ``n_c`` texts judged most representative of *residual_texts*.

    The texts are normalized, embedded as a dense TF-IDF matrix, and the
    indices chosen by ``closest_n_index`` are mapped back onto the
    ORIGINAL (un-normalized) texts.

    Args:
        residual_texts: iterable of raw text strings.
        n_c: number of representative texts to return (default 10).

    Returns:
        list of the selected original texts.
    """
    txtN = textNormalization()
    # normalize_texts yields token lists; re-join them for the vectorizer
    normal_texts = [' '.join(tokens) for tokens in txtN.normalize_texts(residual_texts)]
    vect = TfidfVectorizer(min_df=5)
    tfidf = vect.fit_transform(normal_texts)
    # .toarray() is the explicit, non-deprecated spelling of sparse `.A`
    X = tfidf.toarray()
    closest = closest_n_index(X, n_c=n_c)
    # comprehension instead of the manual append loop
    return [residual_texts[i] for i in closest]
	def texts_to_mean_hist(self, texts, method):
		"""Average the per-text word histograms over *texts*.

		Each text is converted to a word-count histogram via
		``textNormalization.text_to_hist`` and the counts are summed,
		then divided by the number of texts.

		Note: *method* is accepted for signature compatibility but is
		not used by this implementation.

		Returns:
			dict mapping word -> mean count across *texts*.
		"""
		totals = Counter()
		normalizer = textNormalization()
		for text in texts:
			totals += normalizer.text_to_hist(text)
		n_texts = len(texts)
		# dict comprehension replaces the explicit accumulation loop;
		# when *texts* is empty, totals is empty too, so no ZeroDivisionError
		return {word: count / n_texts for word, count in totals.items()}
	def compare_one_all(self, textA, texts, method):
		"""Compare one (id, text) pair against every pair in *texts*.

		For each candidate text, the two histograms are restricted to
		the words they share (in the reference histogram's iteration
		order) and *method* is applied to the two aligned value lists.

		Args:
			textA: tuple ``(id, text)`` used as the reference.
			texts: iterable of ``(id, text)`` tuples to compare against.
			method: callable taking two aligned numeric lists.

		Returns:
			dict mapping each candidate id to ``method``'s result.
		"""
		normalizer = textNormalization()
		id_A, text_A = textA
		reference_hist = normalizer.text_to_hist(text_A)
		results = {}
		for id_t, text_t in texts:
			candidate_hist = normalizer.text_to_hist(text_t)
			# keep only words present in both histograms,
			# preserving the reference histogram's order
			shared = [w for w in reference_hist if w in candidate_hist]
			ref_values = [reference_hist[w] for w in shared]
			cand_values = [candidate_hist[w] for w in shared]
			results[id_t] = method(ref_values, cand_values)
		return results
 def __init__(self, list_texts_control, treshold_distance=None):
     """Fit a TF-IDF model on the control texts and fix a distance threshold.

     Args:
         list_texts_control: raw control texts to fit the vectorizer on.
         treshold_distance: optional cosine-distance threshold; when
             omitted (None), the mean pairwise cosine distance of the
             control set is used.
     """
     self.list_texts_control = list_texts_control
     self.vectorizer = TfidfVectorizer()
     self.txtN = textNormalization()
     # normalize_texts yields token lists; re-join them for the vectorizer
     self.normal_texts = [
         ' '.join(tokens)
         for tokens in self.txtN.normalize_texts(self.list_texts_control)
     ]
     self.vect_fit = self.vectorizer.fit(self.normal_texts)
     # .toarray() is the explicit, non-deprecated spelling of sparse `.A`
     self.X_control = self.vect_fit.transform(self.normal_texts).toarray()
     # `is not None` (not truthiness) so an explicit threshold of 0 / 0.0
     # is honored instead of being silently replaced by the mean distance
     if treshold_distance is not None:
         self.treshold_distance = treshold_distance
     else:
         self.treshold_distance = np.mean(
             distance.cdist(self.X_control, self.X_control, 'cosine'))
# Example #5 (snippet-site separator and vote count — extraction artifact, not code)
 def __init__(self,
              df_path,
              use_pca=True,
              N=1000,
              min_df=0.1,
              max_df=0.5,
              df_column_text='text',
              df_class_text='class',
              create_y=True):
     """Load a CSV dataset and build the TF-IDF design matrix.

     Args:
         df_path: path to the CSV file read with ``pd.read_csv``.
         use_pca: when True, also project X with ``create_X_PCA``.
         N: number of PCA components passed to ``create_X_PCA``.
         min_df / max_df: document-frequency bounds for TfidfVectorizer.
         df_column_text: name of the text column (read by ``create_X_y``
             — presumably; that method is defined elsewhere).
         df_class_text: name of the label column.
         create_y: stored flag; presumably controls whether ``create_X_y``
             builds the label vector — confirm against that method.
     """
     self.df = pd.read_csv(df_path)
     self.vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df)
     self.txtN = textNormalization()
     self.create_y = create_y
     # NOTE(review): create_X_y relies on self.df / self.vectorizer /
     # self.txtN / self.create_y being set above — keep this ordering.
     self.create_X_y(df_column_text, df_class_text)
     self.use_pca = use_pca
     if use_pca:
         self.create_X_PCA(N=N)
 def __init__(self,
              classifier_method,
              use_pca=True,
              N=10,
              max_df=0.9,
              df_path='texts_manually_classified.csv',
              X_and_y=None):
     """Build (or accept) a design matrix and train a classifier on it.

     Args:
         classifier_method: passed through to ``create_clf``.
         use_pca: when True and X/y are built here, project with PCA.
         N: number of PCA components.
         max_df: upper document-frequency bound for TfidfVectorizer.
         df_path: CSV read with ``pd.read_csv``.
         X_and_y: optional pre-built ``(X, y)`` pair; when given, the
             CSV-derived features are skipped.
     """
     self.df = pd.read_csv(df_path)
     self.vectorizer = TfidfVectorizer(max_df=max_df)
     self.txtN = textNormalization()
     # always record the flag so the attribute exists on both paths
     self.use_pca = use_pca
     # identity test with None (not truthiness): an empty-but-provided
     # X_and_y should be unpacked, not silently rebuilt from the CSV
     if X_and_y is None:
         self.create_X_y()
         if use_pca:
             self.create_X_PCA(N)
     else:
         self.X, self.y = X_and_y
     self.create_clf(classifier_method)
import os
import sys

# make the parent directory importable before pulling in the crawlers package
sys.path.append(os.path.dirname(os.getcwd()))

from textNormalization import textNormalization
from crawlers.common.conexao_local import cursorConexao

if __name__ == '__main__':
    # Usage: script.py <source_table> <destination_table>
    tabela_inicial = sys.argv[1]
    tabela_final = sys.argv[2]
    cursor = cursorConexao()
    tn = textNormalization()
    # NOTE(review): table identifiers cannot be bound as query parameters,
    # so they are still interpolated — they come from argv, not the DB.
    cursor.execute('SELECT id, texto from %s where texto is not null;' %
                   (tabela_inicial, ))
    dados = cursor.fetchall()
    decisoes = tn.dicionario_invertido_id_texto(dados)
    insert_sql = ('INSERT INTO %s (palavra,id_texto) values (%%s,%%s);' %
                  (tabela_final, ))
    for k, v in decisoes.items():
        # parameterized values instead of hand-quoted string interpolation:
        # the old '"%s"' form broke (and allowed injection) on any word or
        # id containing a quote. str(v) keeps the original stringification.
        cursor.execute(insert_sql, (k, str(v)))