def create_converters(r, in_size):
    """
    Build the text converter for a representation, optionally backed by a
    pickled PCA model.

    :param r: Representation name. "pos", "tag" and "fw" select the POS, tag
        and function word converters; any other value selects the word
        vector converter.
    :param in_size: Size used to locate the pickled PCA model file
        "models/pca_<r>_<in_size>.p". -1 means no PCA model is loaded.
    :return: The converter instance built for the representation.
    """
    if in_size != -1:
        # Pickle streams are binary: open with 'rb' (text mode breaks
        # pickle.load on Python 3) and close the handle deterministically.
        # NOTE: pickle.load can execute arbitrary code; only load model files
        # that come from a trusted source.
        with open("models/pca_" + r + "_" + str(in_size) + ".p", 'rb') as model_file:
            pca_model = pickle.load(model_file)
        # end with
    else:
        pca_model = None
    # end if

    # >> 1. Choose a text to symbol converter.
    if r == "pos":
        converter = RCNLPPosConverter(resize=-1, pca_model=pca_model, fill_in=True)
    elif r == "tag":
        converter = RCNLPTagConverter(resize=-1, pca_model=pca_model, fill_in=True)
    elif r == "fw":
        converter = RCNLPFuncWordConverter(resize=-1, pca_model=pca_model, fill_in=True)
    else:
        # The word vector converter is built without the fill_in option.
        converter = RCNLPWordVectorConverter(resize=-1, pca_model=pca_model)
    # end if

    return converter
# end create_converters
# For each size
for in_size in reps[r]:
    print("For representation %s size %d" % (r, in_size))

    # Load the PCA model matching this representation and size (-1 means no PCA).
    if in_size != -1:
        # Pickle streams are binary: open with 'rb' (text mode breaks
        # pickle.load on Python 3) and close the handle deterministically.
        # NOTE: pickle.load can execute arbitrary code; only load trusted model files.
        with open("models/pca_" + r + "_" + str(in_size) + ".p", 'rb') as model_file:
            pca_model = pickle.load(model_file)
        # end with
    else:
        pca_model = None
    # end if

    # >> 1. Choose a text to symbol converter.
    if r == "pos":
        converter = RCNLPPosConverter(resize=-1, pca_model=pca_model)
    elif r == "tag":
        converter = RCNLPTagConverter(resize=-1, pca_model=pca_model)
    elif r == "fw":
        converter = RCNLPFuncWordConverter(resize=-1, pca_model=pca_model)
    elif r == "letter":
        converter = LetterConverter(resize=-1, pca_model=pca_model)
    else:
        converter = RCNLPWordVectorConverter(resize=-1, pca_model=pca_model)
    # end if

    # >> 3. Array for results
    average_success_rate = np.array([])

    # For each samples
    for s in range(0, args.samples):
        print("#")

        # >> 5. Prepare training and test set.
        # Training indexes are a window of args.training_size values starting at s;
        # test indexes are the first args.test_size of the remaining values in 0..99.
        training_set_indexes = np.arange(0, 100, 1)[s:s+args.training_size]
        test_set_indexes = np.delete(np.arange(0, 100, 1), training_set_indexes, axis=0)[:args.test_size]
# PCA model
pca_model = None
if args.pca_model != "":
    # Pickle streams are binary: open with 'rb' (text mode breaks
    # pickle.load on Python 3) and close the handle deterministically.
    # NOTE: the path comes from the command line and pickle.load can execute
    # arbitrary code; only point this option at trusted model files.
    with open(args.pca_model, 'rb') as model_file:
        pca_model = pickle.load(model_file)
    # end with
# end if

# >> 1. Choose a text to symbol converter.
if args.converter == "pos":
    converter = RCNLPPosConverter(resize=args.in_components, pca_model=pca_model)
elif args.converter == "tag":
    converter = RCNLPTagConverter(resize=args.in_components, pca_model=pca_model)
elif args.converter == "fw":
    converter = RCNLPFuncWordConverter(resize=args.in_components, pca_model=pca_model)
else:
    converter = RCNLPWordVectorConverter(resize=args.in_components, pca_model=pca_model)
# end if

# >> 3. Array for results
doc_success_rate_avg = np.array([])
sen_success_rate_avg = np.array([])
doc_success_rate_std = np.array([])
sen_success_rate_std = np.array([])

# Training set sizes
training_set_sizes = np.arange(1, 96, args.step)

# For each training size
# Set up experiment logging and record the current globals and local variables.
logging = RCNLPLogging(exp_name=ex_name, exp_inst=ex_instance,
                       exp_value=RCNLPLogging.generate_experience_name(locals()))
logging.save_globals()
logging.save_variables(locals())

# Read the input text once and close the file; every converter below works
# on the same content, so there is no need to reopen it per representation.
with io.open(args.text, 'r') as input_file:
    input_text = input_file.read()
# end with

# Reduce POS
pca_reduction(RCNLPPosConverter()(input_text), title="POS", ncomponents=args.poscomponents)

# Reduce Tags
pca_reduction(RCNLPTagConverter()(input_text), title="Tags", ncomponents=args.tagcomponents)

# Reduce Word vectors
pca_reduction(RCNLPWordVectorConverter()(input_text), title="Word vectors", ncomponents=args.wvcomponents)

# Reduce FW
pca_reduction(RCNLPFuncWordConverter()(input_text), title="Function words", ncomponents=args.fwcomponents)

# Open logging dir
logging.open_dir()

# end if
def clustering_states(args, texts1, texts2, ex_name, ex_instance, size, input_scaling, leak_rate, spectral_radius,
                      input_sparsity, w_sparsity, logging, save_graph=False, pca_model=None, flow=None):
    """
    Generate reservoir states for the texts of two authors, split them into
    two clusters with k-means and score the clustering.

    :param args: Parsed options. This function reads converter, in_components,
        show_states, homogene, out_components and pca_images, and forwards the
        whole object to generate_documents_states.
    :param texts1: Texts of the first author.
    :param texts2: Texts of the second author.
    :param ex_name: Experiment name, forwarded to generate_plot.
    :param ex_instance: Experiment instance, forwarded to generate_plot.
    :param size: Forwarded to create_reservoir.
    :param input_scaling: Forwarded to create_reservoir.
    :param leak_rate: Forwarded to create_reservoir.
    :param spectral_radius: Forwarded to create_reservoir.
    :param input_sparsity: Forwarded to create_reservoir.
    :param w_sparsity: Forwarded to create_reservoir.
    :param logging: Experiment logger; receives the activation statistics and
        is handed to the plotting helpers.
    :param save_graph: When true, the state plots (and PCA images, if
        args.pca_images is set) are generated.
    :param pca_model: Forwarded to the converter constructor.
    :param flow: Existing flow to reuse; a new one is built with
        create_reservoir when None.
    :return: The result of get_v_measure_score for the two authors' cluster
        assignments.
    """
    def plot_states(title, states):
        # All state plots in this function share the same axes and rendering options.
        generate_plot(logging, ex_name, ex_instance, title, "Time", "Neurons", states, transpose=True,
                      cmap='Greys')
    # end plot_states

    # >> 1. Convert the text to symbolic or continuous representations
    converter_classes = {
        "pos": RCNLPPosConverter,
        "tag": RCNLPTagConverter,
        "fw": RCNLPFuncWordConverter,
    }
    # Any unlisted converter name falls back to word vectors.
    converter_class = converter_classes.get(args.converter, RCNLPWordVectorConverter)
    converter = converter_class(resize=args.in_components, pca_model=pca_model)

    # >> 2. Create an echo state network
    if flow is None:
        flow = create_reservoir(converter.get_n_inputs(), size, input_scaling, leak_rate, spectral_radius,
                                input_sparsity, w_sparsity)
    # end if

    # >> 3. Generate Temporal Representations
    # First author
    a1_states, a1_index, a1_n_samples = generate_documents_states(converter, flow, texts1, args)
    if save_graph:
        plot_states("Temporal representations for Author 1", a1_states[:args.show_states])
    # end if

    # Second author
    a2_states, a2_index, _ = generate_documents_states(converter, flow, texts2, args)
    if save_graph:
        plot_states("Temporal representations for Author 2", a2_states[:args.show_states])
    # end if

    # >> 4. Complete states.
    complete_states = np.vstack((a1_states, a2_states))
    if save_graph:
        plot_states("Complete joined Reservoir states", complete_states)
    # end if

    # Log mean and standard deviation over all states
    logging.save_results("Average neural activations", np.average(complete_states))
    logging.save_results("Std dev of neural activations", np.std(complete_states))

    # Optionally give both authors the same number of states by truncating the longer one
    if args.homogene:
        common_length = min(a1_states.shape[0], a2_states.shape[0])
        a1_states = a1_states[:common_length]
        a2_states = a2_states[:common_length]
    # end if

    # >> 5. Join states.
    join_states = np.vstack((a1_states, a2_states))
    if save_graph:
        plot_states("Joined Reservoir states", join_states)
    # end if

    # >> 6. Clustering
    # Centroids are fitted on the (possibly truncated) joined states, while
    # cluster assignment always runs on the complete states.
    reducer = None
    if args.out_components != -1:
        # PCA fitted on the joined states
        reducer = PCA(n_components=args.out_components)
        reducer.fit(join_states)

        # Images of consecutive principal component pairs
        if args.pca_images and save_graph:
            a1_projected = reducer.transform(a1_states)
            a2_projected = reducer.transform(a2_states)
            # NOTE(review): 8 pairs are hard-coded; presumably this needs
            # out_components >= 9 - confirm against save_pca_image.
            for component in np.arange(8):
                save_pca_image(logging, a1_projected, a2_projected, component, component + 1)
            # end for
        # end if
    # end if

    def project(states):
        # Identity when no output reduction was requested.
        return states if reducer is None else reducer.transform(states)
    # end project

    # Get the two centroids, then assign each sample to a cluster
    centroids, _ = kmeans(project(join_states), 2)
    assignments, _ = vq(project(complete_states), centroids)
    a1_idx = assignments[:a1_n_samples]
    a2_idx = assignments[a1_n_samples:]

    # Score the clustering
    return get_v_measure_score(a1_idx, a2_idx, a1_index, a2_index)
# end clustering_states
if __name__ == "__main__":

    # Argument parser
    parser = argparse.ArgumentParser(
        description="RCNLP - Authorship attribution with Part-Of-Speech to Echo State Network")

    # Argument
    # --file is opened unconditionally below, so it is required: a missing
    # value now yields a usage error instead of a TypeError from io.open(None).
    parser.add_argument("--file", type=str, help="Input text file", required=True)
    parser.add_argument("--lang", type=str, help="Language (ar, en, es, pt)", default='en')
    parser.add_argument("--sample-size", type=int, help="Word vector sample size", default=300)
    args = parser.parse_args()

    # Read the input text, closing the file as soon as it has been read
    with io.open(args.file, 'r') as input_file:
        input_text = input_file.read()
    # end with

    # Convert the text to Temporal Vector Representation
    converter = RCNLPFuncWordConverter()
    doc_array = converter(input_text)

    # Display the Temporal Vector Representation
    RCNLPFuncWordConverter.display_representations(doc_array)

    # Transform the TVR to ESN learning input
    data_set = RCNLPFuncWordConverter.generate_data_set_inputs(doc_array, 2, 0)

# end if