def index_page(html_data):
    """Build the index entries for a single HTML page.

    The page text (with ``<script>`` and ``<style>`` elements removed) is
    tokenised twice via ``preprocessing.preprocess``: once with default
    arguments to obtain the vocabulary, and once with
    ``keep_stop_words=True`` to obtain the token stream whose positions are
    recorded.

    Returns:
        A tuple ``(word_list, frequencies, indexes)`` where ``word_list`` is
        the set of vocabulary tokens, ``frequencies`` maps each of them to
        how often it occurs in the positional token stream, and ``indexes``
        maps each of them to the list of positions at which it occurs there.
    """
    page = BeautifulSoup(html_data, features='html.parser')

    # Strip elements whose content is not visible page text.
    for tag in page(["script", "style"]):
        tag.extract()

    page_text = page.get_text(separator=' ')
    vocabulary_tokens = preprocessing.preprocess(page_text)
    positional_tokens = preprocessing.preprocess(page_text, keep_stop_words=True)

    word_list = set(vocabulary_tokens)

    # Every vocabulary word gets an entry, even if it never shows up in the
    # positional stream.
    frequencies = dict.fromkeys(word_list, 0)
    indexes = {}
    for word in word_list:
        indexes[word] = []

    for position, token in enumerate(positional_tokens):
        if token not in word_list:
            continue
        frequencies[token] += 1
        indexes[token].append(position)

    return word_list, frequencies, indexes
def process_results(results: list, html_path):
    """Attach a text snippet to each search result.

    Args:
        results: sequence of result records; element ``[1]`` holds the hit
            positions and element ``[2]`` the document file name (relative
            to ``html_path``). Element ``[0]`` is passed through untouched.
        html_path: path object of the directory containing the HTML files.

    Returns:
        A list of ``(res[0], res[2], snippet)`` tuples, the snippet being cut
        to at most 250 items via slicing.
    """
    processed = []
    for hit in results:
        positions = hit[1]
        doc_name = hit[2]

        with (html_path / doc_name).open() as handle:
            markup = handle.read()

        page = BeautifulSoup(markup, features='html.parser')
        # Strip elements whose content is not visible page text.
        for tag in page(["script", "style"]):
            tag.extract()

        page_text = page.get_text(separator=' ')
        tokens = preprocessing.preprocess(page_text, raw=True, keep_stop_words=True)
        token_count = len(tokens)

        # For the first 100 hit positions, collect the indices from three
        # tokens before to three tokens after the hit, clipped to the
        # document bounds; duplicates collapse in the set.
        context_indices = set()
        for hit_position in positions[:100]:
            window_start = max(0, hit_position - 3)
            window_stop = min(hit_position + 4, token_count)
            context_indices.update(range(window_start, window_stop))

        outtake = list(group_runs(sorted(context_indices)))
        snippet = make_snippet(tokens, outtake)
        processed.append((hit[0], hit[2], snippet[:250]))
    return processed
def option_percent(window_size, training_file, training_labels, num_examples, percent):
    """Load ``num_examples`` examples and split them into train/test sets.

    Examples are pulled one at a time from the generator returned by
    ``preprocessing.preprocess``; features and the label of each example are
    converted to ``int``. ``percent`` is forwarded as ``test_size`` to
    ``sk.train_test_split`` with a fixed ``random_state`` of 42.

    The visible body ends after the split and does not return anything.
    """
    example_stream = preprocessing.preprocess(training_file, training_labels, window_size)

    feature_rows = []
    label_values = []
    for _ in range(num_examples):
        example = next(example_stream)
        # everything has to be an int before it goes into the arrays
        feature_rows.append([int(value) for value in example[0]])
        label_values.append(int(example[1]))

    # numpy arrays are what the split (and later the tensors) consume
    features = np.asarray(feature_rows)
    labels = np.asarray(label_values)

    X_train, X_test, Y_train, Y_test = sk.train_test_split(
        features, labels, test_size=percent, random_state=42)
def __init__(self, tweet):
    """Store the tweet and derive its processed text and sentiment.

    The tweet text is reduced to its ASCII characters (anything else is
    dropped), passed through ``preprocess`` and the result is handed to
    ``getSentiment``.
    """
    self.tweet = tweet

    # Drop every non-ASCII character, then turn the bytes back into str.
    ascii_bytes = self.tweet.text.encode('ascii', 'ignore')
    decodedText = ascii_bytes.decode('utf-8')

    self.processedText = preprocess(decodedText)
    self.sentiment = getSentiment(self.processedText)
def map_sentiment_value(post):
    """Annotate ``post`` in place with sentiment information and return it.

    When the post has a ``'caption'`` key, the caption is preprocessed and
    scored with ``getSentiment``; the full result is stored under
    ``'sentiment'`` and its ``'compound'`` entry under
    ``'sentiment_compound'``. Posts without a caption get an empty string
    and ``0`` respectively.
    """
    if 'caption' not in post:
        post['sentiment'] = ""
        post['sentiment_compound'] = 0
        return post

    scores = getSentiment(preprocess(post['caption']))
    post['sentiment'] = scores
    post['sentiment_compound'] = scores['compound']
    return post
def main(raw_query, limit):
    """Run a query through the SQLite search and format the top results.

    Args:
        raw_query: the query as entered; it is preprocessed before searching
            and passed unmodified to ``output_string``.
        limit: maximum number of results to process; converted with ``int``.

    Returns:
        Whatever ``output_string`` produces for the query, the elapsed
        search time and the processed results.
    """
    results_considered = int(limit)

    # Only query preprocessing and the search itself are timed.
    started = time.time()
    hits = sqlite_search(preprocessing.preprocess(raw_query))
    time_elapsed = time.time() - started

    cutoff = min(results_considered, len(hits))
    data_dir = Path(__file__).parent / 'data'
    summaries = process_results(hits[:cutoff], data_dir)
    return output_string(raw_query, time_elapsed, summaries)
def main(raw_query, limit):
    """Run a query through ``run_search`` and format the top results.

    Args:
        raw_query: the query as entered; it is preprocessed before searching
            and passed unmodified to ``output_string``.
        limit: maximum number of results to process; converted with ``int``.

    Returns:
        Whatever ``output_string`` produces for the query, the elapsed
        search time and the processed results.
    """
    # The data directory sits next to this file and is used both for the
    # search and for reading the result documents.
    data_dir = Path(__file__).parent / 'data'
    results_considered = int(limit)

    # Only query preprocessing and the search itself are timed.
    started = time.time()
    hits = run_search(data_dir, preprocessing.preprocess(raw_query))
    time_elapsed = time.time() - started

    cutoff = min(results_considered, len(hits))
    summaries = process_results(hits[:cutoff], data_dir)
    return output_string(raw_query, time_elapsed, summaries)
def option_save(window_size, training_file, training_labels, num_examples, dest_file):
    """Read ``num_examples`` examples, then train a model and save it.

    The examples are pulled from the generator returned by
    ``preprocessing.preprocess`` and converted to integer numpy arrays, but
    those arrays are not passed on: training itself is delegated to
    ``train.trainsave`` with the file names.

    Returns:
        ``0``.
    """
    example_stream = preprocessing.preprocess(training_file, training_labels, window_size)

    feature_rows = []
    label_values = []
    for _ in range(num_examples):
        example = next(example_stream)
        # everything has to be an int before it goes into the arrays
        feature_rows.append([int(value) for value in example[0]])
        label_values.append(int(example[1]))

    features = np.asarray(feature_rows)
    labels = np.asarray(label_values)

    # train a model and save it to dest_file
    train.trainsave(num_examples, training_file, training_labels, dest_file)
    return 0
def evaluate_mp(window_size, input_file, label_file, num_examples, in_file):
    """Rebuild the two-hidden-layer perceptron graph and print its test error.

    The graph is built from the module-level constants ``CLASSES``,
    ``LAYER_1_SUBGRAPHS``, ``LAYER_2_SUBGRAPHS`` and ``LEARNING_RATE``; a
    saved model is then imported from ``in_file`` and the cost over at most
    ``num_examples`` examples is printed, divided by ``num_examples``.

    Args:
        window_size: side length of the input window; each example has
            ``window_size * window_size`` features.
        input_file: passed to ``preprocessing.preprocess`` as first argument.
        label_file: passed to ``preprocessing.preprocess`` as second argument.
        num_examples: maximum number of examples read from the generator.
        in_file: meta-graph file given to ``tf.train.import_meta_graph``.

    Raises:
        AssertionError: if the layer sizes are not divisible by the
            configured number of subgraphs.
    """
    data_size = window_size*window_size

    # tf Graph input
    x = tf.placeholder("float", [None, (data_size)]) #inputs
    y_ = tf.placeholder("float", [None, CLASSES]) #ground-truth labels

    #make sure that topology setup will work
    layer_1_nodes = data_size
    layer_2_nodes = data_size
    assert data_size % LAYER_1_SUBGRAPHS == 0
    assert layer_1_nodes % LAYER_1_SUBGRAPHS == 0
    assert layer_2_nodes % LAYER_2_SUBGRAPHS == 0
    assert CLASSES % LAYER_2_SUBGRAPHS == 0

    #create variables to store weights and biases
    #h1, h2, b1, and b2 contain lists of variables to be used in the
    # subconnected layers
    #h1 and b1 create variables that each correspond to one of the subgraphs
    # of layer 1. There should be (LAYER_1_SUBGRAPHS) different variables
    # created in each. Each variable should be named "h1_[#]" or "b1_[#]",
    # where "#" is the variable number
    #h2 and b2 are the same as h1 and b1 except that they apply to the second
    # subconnected layer
    #the out variables control the input into the fully-connected final layer
    # and are named "out_weights" and "out_biases"
    #NOTE: THE NAMES ARE NECESSARY TO SAVE THE MODEL TO A FILE
    weights = {
        'h1': [tf.Variable(tf.random_normal([int(data_size/LAYER_1_SUBGRAPHS), int(layer_1_nodes/LAYER_1_SUBGRAPHS)]), name=("h1_"+str(s))) for s in range(0, LAYER_1_SUBGRAPHS)],
        'h2': [tf.Variable(tf.random_normal([int(layer_1_nodes/LAYER_2_SUBGRAPHS), int(layer_2_nodes/LAYER_2_SUBGRAPHS)]), name=("h2_"+str(s))) for s in range(0, LAYER_2_SUBGRAPHS)],
        'out': tf.Variable(tf.random_normal([int(layer_2_nodes), int(CLASSES)]), name= "out_weights")
    }
    biases = {
        'b1': [tf.Variable(tf.random_normal([int(layer_1_nodes/LAYER_1_SUBGRAPHS)]), name=("b1_"+str(s))) for s in range(0, LAYER_1_SUBGRAPHS)],
        'b2': [tf.Variable(tf.random_normal([int(layer_2_nodes/LAYER_2_SUBGRAPHS)]), name=("b2_"+str(s))) for s in range(0, LAYER_2_SUBGRAPHS)],
        'out': tf.Variable(tf.random_normal([int(CLASSES)]), name="out_biases")
    }

    #add variables to collection and initialize the saver
    # NOTE(review): what is added to the 'vars' collection here are the name
    # strings, not the tf.Variable objects themselves -- confirm intended.
    for s in range(0, LAYER_1_SUBGRAPHS):
        tf.add_to_collection('vars', ("h1_"+str(s)))
        tf.add_to_collection('vars', ("b1_"+str(s)))
    for s in range(0, LAYER_2_SUBGRAPHS):
        tf.add_to_collection('vars', ("h2_"+str(s)))
        tf.add_to_collection('vars', ("b2_"+str(s)))
    tf.add_to_collection('vars', "out_weights")
    tf.add_to_collection('vars', "out_biases")
    # NOTE(review): `saver` is not used again in this function.
    saver = tf.train.Saver()

    # Construct model
    #y contains the predicted outputs which will be compared to the
    #ground-truth, y_
    y = multilayer_perceptron(x, weights, biases)

    # Define loss and optimizer
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_))
    # NOTE(review): `optimizer` and `init` are built but never run here.
    optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(cost)

    # Initializing the variables
    init = tf.global_variables_initializer()

    #get generator for features and labels
    generator = preprocessing.preprocess(input_file, label_file, window_size)
    features = []
    labels = []
    # Stop after num_examples items; a shorter generator simply yields fewer.
    for count, curr in enumerate(generator):
        if count >= num_examples:
            break
        curr_features = curr[0]
        curr_features = list(map(float, curr_features))
        curr_labels = curr[1]
        curr_labels = list(map(float, curr_labels))
        features.append(curr_features)
        labels.append(curr_labels)
    features = np.asarray(features)
    labels = np.asarray(labels)

    with tf.Session() as sess:
        #load the data from in_file
        # NOTE(review): the checkpoint restored is the latest one found in
        # './', independent of where in_file lives -- confirm intended.
        loader = tf.train.import_meta_graph(in_file)
        loader.restore(sess, tf.train.latest_checkpoint('./'))
        total_error = sess.run([cost], feed_dict={x:features, y_:labels})[0]
        # NOTE(review): cost is already a reduce_mean; it is divided by
        # num_examples again before printing.
        print("The test error was", (total_error/num_examples))
def train_mp(window_size, input_file, label_file, num_examples, out_file):
    """Build the two-hidden-layer perceptron graph, train it and save it.

    The graph is built from the module-level constants ``CLASSES``,
    ``LAYER_1_SUBGRAPHS``, ``LAYER_2_SUBGRAPHS``, ``LEARNING_RATE``,
    ``ITERATIONS`` and ``BATCH_SIZE``. After training, the cost on the
    training data divided by ``num_examples`` is printed and the session is
    saved to ``out_file``.

    Args:
        window_size: side length of the input window; each example has
            ``window_size * window_size`` features.
        input_file: passed to ``preprocessing.preprocess`` as first argument.
        label_file: passed to ``preprocessing.preprocess`` as second argument.
        num_examples: maximum number of examples read from the generator.
        out_file: path handed to ``saver.save``.

    Raises:
        AssertionError: if the layer sizes are not divisible by the
            configured number of subgraphs.
    """
    data_size = window_size*window_size

    # tf Graph input
    x = tf.placeholder("float", [None, (data_size)]) #inputs
    y_ = tf.placeholder("float", [None, CLASSES]) #ground-truth labels

    #make sure that topology setup will work
    layer_1_nodes = data_size
    layer_2_nodes = data_size
    assert data_size % LAYER_1_SUBGRAPHS == 0
    assert layer_1_nodes % LAYER_1_SUBGRAPHS == 0
    assert layer_2_nodes % LAYER_2_SUBGRAPHS == 0
    assert CLASSES % LAYER_2_SUBGRAPHS == 0

    #create variables to store weights and biases
    #h1, h2, b1, and b2 contain lists of variables to be used in the
    # subconnected layers
    #h1 and b1 create variables that each correspond to one of the subgraphs
    # of layer 1. There should be (LAYER_1_SUBGRAPHS) different variables
    # created in each. Each variable should be named "h1_[#]" or "b1_[#]",
    # where "#" is the variable number
    #h2 and b2 are the same as h1 and b1 except that they apply to the second
    # subconnected layer
    #the out variables control the input into the fully-connected final layer
    # and are named "out_weights" and "out_biases"
    #NOTE: THE NAMES ARE NECESSARY TO SAVE THE MODEL TO A FILE
    weights = {
        'h1': [tf.Variable(tf.random_normal([int(data_size/LAYER_1_SUBGRAPHS), int(layer_1_nodes/LAYER_1_SUBGRAPHS)]), name=("h1_"+str(s))) for s in range(0, LAYER_1_SUBGRAPHS)],
        'h2': [tf.Variable(tf.random_normal([int(layer_1_nodes/LAYER_2_SUBGRAPHS), int(layer_2_nodes/LAYER_2_SUBGRAPHS)]), name=("h2_"+str(s))) for s in range(0, LAYER_2_SUBGRAPHS)],
        'out': tf.Variable(tf.random_normal([int(layer_2_nodes), int(CLASSES)]), name= "out_weights")
    }
    biases = {
        'b1': [tf.Variable(tf.random_normal([int(layer_1_nodes/LAYER_1_SUBGRAPHS)]), name=("b1_"+str(s))) for s in range(0, LAYER_1_SUBGRAPHS)],
        'b2': [tf.Variable(tf.random_normal([int(layer_2_nodes/LAYER_2_SUBGRAPHS)]), name=("b2_"+str(s))) for s in range(0, LAYER_2_SUBGRAPHS)],
        'out': tf.Variable(tf.random_normal([int(CLASSES)]), name="out_biases")
    }

    #add variables to collection and initialize the saver
    # NOTE(review): what is added to the 'vars' collection here are the name
    # strings, not the tf.Variable objects themselves -- confirm intended.
    for s in range(0, LAYER_1_SUBGRAPHS):
        tf.add_to_collection('vars', ("h1_"+str(s)))
        tf.add_to_collection('vars', ("b1_"+str(s)))
    for s in range(0, LAYER_2_SUBGRAPHS):
        tf.add_to_collection('vars', ("h2_"+str(s)))
        tf.add_to_collection('vars', ("b2_"+str(s)))
    tf.add_to_collection('vars', "out_weights")
    tf.add_to_collection('vars', "out_biases")
    saver = tf.train.Saver()

    # Construct model
    #y contains the predicted outputs which will be compared to the
    #ground-truth, y_
    y = multilayer_perceptron(x, weights, biases)

    # Define loss and optimizer
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_))
    optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(cost)

    # Initializing the variables
    init = tf.global_variables_initializer()

    #get the generator for features and labels
    generator = preprocessing.preprocess(input_file, label_file, window_size)
    features = []
    labels = []
    # Stop after num_examples items; a shorter generator simply yields fewer.
    for count, curr in enumerate(generator):
        if count >= num_examples:
            break
        curr_features = curr[0]
        curr_features = list(map(float, curr_features))
        curr_labels = curr[1]
        curr_labels = list(map(float, curr_labels))
        features.append(curr_features)
        labels.append(curr_labels)
    features = np.asarray(features)
    labels = np.asarray(labels)

    # Launch the graph
    with tf.Session() as sess:
        sess.run(init)

        # Training cycle
        for epoch in range(ITERATIONS):
            '''avg_cost = 0.''' #removed from example code to simplify
            total_batch = int(num_examples/BATCH_SIZE)
            # Loop over all batches
            # NOTE(review): every iteration feeds the complete features and
            # labels arrays, not a BATCH_SIZE slice -- confirm intended.
            for i in range(total_batch):
                # Run optimization op (backprop) and cost op (to get loss value)
                sess.run([optimizer, cost], feed_dict={x: features, y_: labels})
                #removed avg_cost tracking for simplicity
                '''# Compute average loss
                avg_cost += int(c / total_batch)''' #c was collected from sess.run
            #removed this section from the example code for simplicity
            '''# Display logs per epoch step
            if epoch % display_step == 0:
                print("Epoch:", '%04d' % (epoch+1), "cost=", \
                    "{:.9f}".format(avg_cost))'''
        print("Optimization Finished!")

        #print training accuracy
        curr_loss = sess.run([cost], feed_dict={x:features, y_:labels})[0]
        # NOTE(review): cost is already a reduce_mean; it is divided by
        # num_examples again before printing.
        print("The training error was", (curr_loss/num_examples))

        #output to out_file
        saver.save(sess, out_file)
# -----------------------------------------------------------------------------
#
# Utils
#
# -----------------------------------------------------------------------------

# @app.route('/uploaded/<filename>')
# def uploaded_file(filename):
#     return send_from_directory(app.config['UPLOAD_FOLDER'], filename)

# -----------------------------------------------------------------------------
#
# Main
#
# -----------------------------------------------------------------------------

# Script entry point:
#   * "collectstatic" as first CLI argument: collect the static files and,
#     when S3 is enabled in the config, upload them.
#   * otherwise: render the assets and start the development server.
if __name__ == '__main__':
    import preprocessing.preprocessing as preprocessing
    import sys

    if len(sys.argv) > 1 and sys.argv[1] == "collectstatic":
        preprocessing._collect_static(app)
        # Test the flag's value, not merely its presence: a config that sets
        # USE_S3 = False must not trigger an upload.
        if app.config.get('USE_S3'):
            flask_s3.create_all(app)
    else:
        # render ccss, coffeescript and shpaml in 'templates' and 'static' dirs
        preprocessing.preprocess(app, request)

        # set FileSystemCache instead of Memcache for development
        # cache = werkzeug.contrib.cache.FileSystemCache(os.path.join(app.root_path, "cache"))

        # run application
        app.run()

# EOF
def test_parsed_header(self):
    """The preprocessed frame must expose exactly the expected columns.

    Column order is irrelevant; only the multiset of names is compared.
    """
    expected_columns = [
        '_id_fact', '_index_fact', '_score_fact', '_type_fact', 'cprojectID',
        'documentID', 'identifiers', 'post', 'prefix', 'term', '_id_meta',
        '_index_meta', '_score_meta', '_type_meta', 'abstractText',
        'affiliation', 'authorIdList', 'authorList', 'authorString',
        'chemicalList', 'citedByCount', 'commentCorrectionList',
        'dateOfCompletion', 'dateOfCreation', 'dateOfRevision',
        'dbCrossReferenceList', 'doi', 'electronicPublicationDate',
        'embargoDate', 'epmcAuthMan', 'firstPublicationDate',
        'fullTextUrlList', 'grantsList', 'hasBook', 'hasDbCrossReferences',
        'hasLabsLinks', 'hasPDF', 'hasReferences', 'hasSuppl',
        'hasTMAccessionNumbers', 'hasTextMinedTerms', 'id', 'inEPMC', 'inPMC',
        'investigatorList', 'isOpenAccess', 'journalInfo', 'keywordList',
        'language', 'license', 'luceneScore', 'meshHeadingList', 'pageInfo',
        'pmcid', 'pmid', 'pubModel', 'pubTypeList', 'pubYear', 'source',
        'subsetList', 'title', 'tmAccessionTypeList', 'sourcedict',
    ]

    parsed_frame = preprocessing.preprocess(rawfactspath, rawmetadatapath)
    actual_columns = list(parsed_frame.columns.values)

    self.assertCountEqual(actual_columns, expected_columns, "parsed columns unequal")
"""
#individual_list = list that contains all gene(individual) expression values
#individual_id_list = contains all UNIQUE GENE ID that is used for calculating BHI
#individual_ref_id_list = contains all GENE reference id that is used for calculating PPI Interaction Score
#all_datamatrix, all_normalized_data_matrix = preprocessed the gene expression values that is fed to FCM
#no_of_annotated_cluster: Number of annotated cluster that we get from DAVID tool
#annotated_cluster_list: Basically it is a list of list that contains all gene id which are belongs to annotated cluster
#annotated_gene_list: Set of all annotated gene id
#annotated_gene_cooccurance_matrix(n X n where n = no of total gene(individual)) : represent if any two gene are belong to same annotated cluster or not
#unique_protein_ref: Contains unique Gene refernce ID
#interaction_score_matrix(n X n where n = number of unique protein reference): contains the interaction score of any interaction of two proteins
"""
# Load the expression data set; the path below is a placeholder that has to
# be replaced with the real input file.
individual_list = []
individual_length, individual_list, individual_id_list, individual_ref_id_list = preprocess(
    '<path_to_input_dataset>')
No_of_genes = len(individual_list)
all_data_matrix, all_normalized_data_matrix = preprocess_fcm_datamatrix(
    individual_length, individual_list)

# One tuple entry per line of the id file (line endings are kept). The file
# is read inside a `with` block so the handle is closed right away instead
# of being left to the garbage collector.
with open('Intermediate_Data/population_id.txt', 'r') as population_id_file:
    gene_id_list = tuple(population_id_file)

no_of_annotated_cluster, annotated_cluster_list, annotated_gene_list, annotated_gene_cooccurance_matrix = AnnotatedClustering(
    individual_id_list)
unique_protein_ref, interaction_score_matrix = Confidence_Score_Matrix()
"""
Input from user
"""
chromosome_number = int(
    sys.argv[1]
)  # Enter the number of chromosome(individual) you want to generate
generation_number = int(sys.argv[2])  # Enter the maximum number of generation
def transform(self, X, y=None, **fit_params):
    """Return a list with ``preprocessing.preprocess`` applied to each item of ``X``.

    ``y`` and any extra keyword arguments are accepted and ignored.
    """
    processed = []
    for document in X:
        processed.append(preprocessing.preprocess(document))
    return processed
def print_generation(population, generation_num): print("Generation:- {}".format(generation_num)) """ #individual_list = list that contains all gene(individual) expression values #individual_id_list = contains all UNIQUE GENE ID that is used for calculating BHI #individual_ref_id_list = contains all GENE reference id that is used for calculating PPI Interaction Score #all_datamatrix, all_normalized_data_matrix = preprocessed the gene expression values that is fed to FCM #unique_protein_ref: Contains unique Gene refernce ID """ individual_list = [] individual_length, individual_list, individual_id_list, individual_ref_id_list = preprocess( 'Input_data/preprocessed_ILD.txt') all_data_matrix, all_normalized_data_matrix = preprocess_fcm_datamatrix( individual_length, individual_list) """ Input from user """ chromosome_number = int( sys.argv[1] ) # Enter the number of chromosome(individual) you want to generate generation_number = int(sys.argv[2]) # Enter the maximum number of generation zdt_definitions = ZDT3Definitions(individual_list, individual_id_list, individual_ref_id_list) plotter = Plotter(zdt_definitions, individual_list) problem = ZDT(zdt_definitions, all_normalized_data_matrix, chromosome_number) evolution = Evolution(problem, generation_number, chromosome_number,
X_train = pd.read_csv(f"{CACHE_FOLDER}/X_train_preprocessed.csv", index_col="Unnamed: 0") Y_train = pd.read_csv(f"{CACHE_FOLDER}/Y_train_preprocessed.csv", index_col="Unnamed: 0") # If the dataset is not found except: print("\t-> File not found, generating preprocessed datasets") # Load normal datasets X_train = pd.read_csv(f"{DATASET_FOLDER}/X_train_update.csv", index_col="Unnamed: 0") Y_train = pd.read_csv(f"{DATASET_FOLDER}/Y_train_CVw08PX.csv", index_col="Unnamed: 0") # preprocess datasets X_train, Y_train = preprocess(X_train, Y_train) # save preprocessed datasets X_train.to_csv(f"{CACHE_FOLDER}/X_train_preprocessed.csv") Y_train.to_csv(f"{CACHE_FOLDER}/Y_train_preprocessed.csv") print("\t-> Done\n") ############################# # STEP 2: sentences embedding ############################# print("STEP 2: Preparing data for training...") train_x, valid_x, train_y, valid_y = get_datasets_for_training( X_train['designation'], Y_train['prdtypecode'], "tfidf")
def test_mp(window_size, input_file, label_file, num_examples, in_file, layers, nodes, subgraphs, classes, iterations, batch_size, training_rate):
    """Rebuild a configurable sub-connected perceptron and print its test error.

    All scalar arguments are cast to their numeric types first, so they may
    be given as strings. The graph is built for ``layers`` hidden layers, a
    saved model is imported from ``in_file`` and the cost over at most
    ``num_examples`` examples is printed, divided by ``num_examples``.

    Args:
        window_size: side length of the input window; each example has
            ``window_size * window_size`` features.
        input_file: passed to ``preprocessing.preprocess`` as first argument.
        label_file: passed to ``preprocessing.preprocess`` as second argument.
        num_examples: maximum number of examples read from the generator.
        in_file: meta-graph file given to ``tf.train.import_meta_graph``.
        layers: number of hidden layers.
        nodes: per-layer node counts; its string form (spaces removed) is
            parsed with ``literal_eval`` and must have ``layers`` entries.
        subgraphs: per-layer subgraph counts, parsed the same way as ``nodes``.
        classes: number of output classes.
        iterations: cast to int; not used further in this function.
        batch_size: cast to int; not used further in this function.
        training_rate: learning rate handed to the Adam optimizer.

    Raises:
        AssertionError: if the list lengths do not match ``layers`` or the
            layer sizes are not divisible by the subgraph counts.
    """
    #cast variables to correct types
    window_size = int(window_size)
    num_examples = int(num_examples)
    layers = int(layers)
    #get rid of any spaces in nodes and subgraphs so they cast correctly
    nodes = literal_eval(str(nodes).replace(' ', ''))
    subgraphs = literal_eval(str(subgraphs).replace(' ', ''))
    classes = int(classes)
    iterations = int(iterations)
    batch_size = int(batch_size)
    training_rate = float(training_rate)

    #make sure length of lists is correct
    assert (layers == len(nodes))
    assert (layers == len(subgraphs))

    #define nodes[0] as the data_size and subgraphs[0] as 1
    data_size = window_size * window_size
    nodes = [data_size] + nodes
    subgraphs = [1] + subgraphs

    #make sure that topology setup will work
    #check up to layers-1, the highest index
    for i in range(1, layers):
        assert (nodes[i - 1] % subgraphs[i] == 0)
        assert (nodes[i] % subgraphs[i] == 0)
    assert (classes % subgraphs[layers] == 0)

    data_size = window_size * window_size

    # tf Graph input
    x = tf.placeholder("float", [None, (data_size)])  #inputs
    y_ = tf.placeholder("float", [None, classes])  #ground-truth labels

    #create variables to store weights and biases
    #create an h in weights and a b in biases for each layer in the model
    #h1 and b1 create variables that each correspond to one of the subgraphs
    # of layer 1. There should be (subgraphs[1]) different subvariables
    # created in each. Each subvariable should be named "h1_[#]" or "b1_[#]",
    # where "#" is the subvariable number
    #h2 and b2 are the same as h1 and b1 except that they apply to the second
    # subconnected layer, as are h3 and b3 for the third and so on
    #the out variables control the input into the fully-connected final layer
    # and are named "out_weights" and "out_biases"
    #NOTE: THE NAMES ARE NECESSARY TO SAVE THE MODEL TO A FILE

    #start by initializing weights and biases with the out variables
    weights = {
        'out':
        tf.Variable(tf.random_normal([int(nodes[layers]),
                                      int(classes)]),
                    name="out_weights")
    }
    biases = {
        'out': tf.Variable(tf.random_normal([int(classes)]), name="out_biases")
    }

    #add in the h and b variables for each hidden layer
    #note: you are creating subgraphs[i] subvariables in both weights and
    #biases; each weight subvariable has shape
    #(nodes[i-1]/subgraphs[i], nodes[i]/subgraphs[i]) and each bias
    #subvariable has length nodes[i]/subgraphs[i].
    #the s in range(0, subgraphs[i]) is creating multiple subvariables inside
    #of each weights[weights_name] or biases[biases_name]
    #for documentation on creating each of these subvariables, see
    # https://www.tensorflow.org/api_docs/python/tf/random_normal
    for i in range(1, layers + 1):
        weights_name = "h" + str(i)
        biases_name = "b" + str(i)
        weights[weights_name] = [
            tf.Variable(tf.random_normal([
                int((nodes[i - 1]) / subgraphs[i]),
                int(nodes[i] / subgraphs[i])
            ]),
                        name=(weights_name + "_" + str(s)))
            for s in range(0, subgraphs[i])
        ]
        biases[biases_name] = [
            tf.Variable(tf.random_normal([int((nodes[i]) / subgraphs[i])]),
                        name=(biases_name + "_" + str(s)))
            for s in range(0, subgraphs[i])
        ]

    #add variables to collection and initialize the saver
    #for each layer, add all of the subvariables
    # NOTE(review): what is added to the 'vars' collection here are the name
    # strings, not the tf.Variable objects themselves -- confirm intended.
    for i in range(1, layers + 1):
        weights_name = "h" + str(i) + "_"
        biases_name = "b" + str(i) + "_"
        for s in range(subgraphs[i]):
            subweight_name = weights_name + str(s)  #each should be "h(i)_(s)"
            subbias_name = biases_name + str(s)  #each should be "b(i)_(s)"
            tf.add_to_collection('vars', subweight_name)
            tf.add_to_collection('vars', subbias_name)
    #add the out variables
    tf.add_to_collection('vars', "out_weights")
    tf.add_to_collection('vars', "out_biases")

    #initialize saver
    # NOTE(review): `saver` is not used again in this function.
    saver = tf.train.Saver()

    # Construct model
    #y contains the predicted outputs which will be compared to the
    #ground-truth, y_
    y = multilayer_perceptron(x, layers, weights, biases, subgraphs)

    # Define loss and optimizer
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_))
    # NOTE(review): `optimizer` is built but never run here.
    optimizer = tf.train.AdamOptimizer(
        learning_rate=training_rate).minimize(cost)

    #get the generator for features and labels
    generator = preprocessing.preprocess(input_file, label_file, window_size)
    features = []
    labels = []
    # Stop after num_examples items; a shorter generator simply yields fewer.
    for count, curr in enumerate(generator):
        if count >= num_examples:
            break
        curr_features = curr[0]
        curr_features = list(map(float, curr_features))
        curr_labels = curr[1]
        curr_labels = list(map(float, curr_labels))
        features.append(curr_features)
        labels.append(curr_labels)
    features = np.asarray(features)
    labels = np.asarray(labels)

    # Launch the graph
    with tf.Session() as sess:
        #load the data from in_file
        #NOTE: IF RUNNING ON A DIFFERENT MACHINE THAN IT WAS TRAINED ON, ADD
        #"clear_devices=true" into the arguments for import_meta_graph
        # NOTE(review): the checkpoint restored is the latest one found in
        # './', independent of where in_file lives -- confirm intended.
        loader = tf.train.import_meta_graph(in_file)
        loader.restore(sess, tf.train.latest_checkpoint('./'))
        total_error = sess.run([cost], feed_dict={x: features, y_: labels})[0]
        # NOTE(review): cost is already a reduce_mean; it is divided by
        # num_examples again before printing.
        print("The test error was", (total_error / num_examples))
def process(dataset_name, out_folder, train_size, one_document_per_folder, force, args, concat_train_instances, shuffle, preprocess):
    """Export a dataset as one text file per document plus a ``stats.json``.

    Args:
        dataset_name: name passed to ``dataset_helper.get_dataset``; also the
            sub-folder of ``out_folder`` that receives the export.
        out_folder: parent directory of the export.
        train_size: forwarded to ``dataset_helper.split_dataset``; the value
            ``1.0`` produces a single set named ``'all'`` instead of
            ``'train'``/``'test'``.
        one_document_per_folder: if true, each document is written to its own
            ``<label>_<doc_id>/0.txt`` folder directly below the set folder;
            otherwise documents are grouped in one folder per label.
        force: if false, the process exits with status 1 when the target
            folder already exists.
        args: stored verbatim under ``'params'`` in ``stats.json``.
        concat_train_instances: if true, documents of the ``'train'`` set are
            written to ``<folder>/<label>/<doc_id>.txt``.
        shuffle: if true the data is split with
            ``dataset_helper.split_dataset``; otherwise everything ends up in
            the training part and the test part is empty.
        preprocess: if true, every document is passed through
            ``preprocessing.preprocess`` before being written.
    """
    out_folder = os.path.join(out_folder, dataset_name)

    if not force and os.path.isdir(out_folder):
        print('Outfolder existing! Aborting ({})'.format(out_folder))
        sys.exit(1)

    X, Y = dataset_helper.get_dataset(dataset_name)
    print('#Docs: {}'.format(len(X)))

    # Remember the labels of the whole dataset for the stats file. Taking
    # them from the last exported set would miss labels (and yields an empty
    # list whenever the test split is empty).
    categories = list(set(Y))

    if preprocess:
        X = [preprocessing.preprocess(x) for x in X]

    if shuffle:
        data_train_X, data_test_X, data_train_Y, data_test_Y = dataset_helper.split_dataset(X, Y, train_size=train_size)
    else:
        data_train_X, data_test_X, data_train_Y, data_test_Y = X, [], Y, []

    if train_size == 1.0:
        sets = [
            ('all', data_train_X, data_train_Y)
        ]
    else:
        sets = [
            ('train', data_train_X, data_train_Y),
            ('test', data_test_X, data_test_Y)
        ]

    # Create folder
    os.makedirs(out_folder, exist_ok=True)

    all_topic_counts = defaultdict(int)
    for set_name, set_X, set_Y in sets:
        # Document ids restart at 0 for every label within each set.
        topic_id_counters = defaultdict(int)
        set_folder = os.path.join(out_folder, set_name)
        assert len(set_X) == len(set_Y)
        for x, y in zip(set_X, set_Y):
            # One folder per label unless every document gets its own folder
            if one_document_per_folder:
                folder = set_folder
            else:
                folder = os.path.join(set_folder, str(y))
            os.makedirs(folder, exist_ok=True)

            doc_id = str(topic_id_counters[y]).zfill(4)
            if concat_train_instances and set_name == 'train':
                filename = '{}/{}/{}.txt'.format(folder, y, doc_id)
            elif one_document_per_folder:
                filename = '{}/{}_{}/{}.txt'.format(folder, y, doc_id, '0')
            else:
                # Without this branch `filename` was either undefined or
                # still pointed at the previous document, which then got
                # overwritten. The label folder already exists, so the
                # document goes straight into it.
                filename = '{}/{}.txt'.format(folder, doc_id)

            # dirname() keeps the leading '/' of absolute paths; splitting on
            # '/' and re-joining the parts silently made them relative.
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            with codecs.open(filename, 'w') as f:
                f.write(x)

            all_topic_counts[y] += 1
            topic_id_counters[y] += 1

    with open(os.path.join(out_folder, 'stats.json'), 'w') as f:
        json.dump({
            'total_docs': sum(all_topic_counts.values()),
            'categories': categories,
            'topic_counts': dict(all_topic_counts),
            'set_counts': {name: len(docs) for name, docs, _ in sets},
            'params': args,
            'timestamp': time_utils.get_time_formatted(),
            'unix_timestamp': time_utils.get_timestamp(),
            'git_commit': str(git_utils.get_current_commit())
        }, f, indent=4, sort_keys=True)
# -----------------------------------------------------------------------------
#
# Utils
#
# -----------------------------------------------------------------------------

# @app.route('/uploaded/<filename>')
# def uploaded_file(filename):
#     return send_from_directory(app.config['UPLOAD_FOLDER'], filename)

# -----------------------------------------------------------------------------
#
# Main
#
# -----------------------------------------------------------------------------

# Script entry point:
#   * "collectstatic" as first CLI argument: collect the static files and,
#     when S3 is enabled in the config, upload them.
#   * otherwise: render the assets and start the development server.
if __name__ == '__main__':
    import preprocessing.preprocessing as preprocessing
    import sys

    if len(sys.argv) > 1 and sys.argv[1] == "collectstatic":
        preprocessing._collect_static(app)
        # .get() treats a missing USE_S3 key as "S3 disabled" instead of
        # failing with a KeyError after the static files were collected.
        if app.config.get('USE_S3'):
            flask_s3.create_all(app)
    else:
        # render ccss, coffeescript and shpaml in 'templates' and 'static' dirs
        preprocessing.preprocess(app, request)

        # set FileSystemCache instead of Memcache for development
        # cache = werkzeug.contrib.cache.FileSystemCache(os.path.join(app.root_path, "cache"))

        # run application
        app.run()

# EOF
def train_mp(window_size, input_file, label_file, num_examples, out_file, layers, nodes, subgraphs, classes, iterations, batch_size, training_rate):
    """Build a configurable sub-connected perceptron, train it and save it.

    All scalar arguments are cast to their numeric types first, so they may
    be given as strings. After training, the cost on the training data
    divided by ``num_examples`` and the elapsed time are printed and the
    session is saved to ``out_file``.

    Args:
        window_size: side length of the input window; each example has
            ``window_size * window_size`` features.
        input_file: passed to ``preprocessing.preprocess`` as first argument.
        label_file: passed to ``preprocessing.preprocess`` as second argument.
        num_examples: maximum number of examples read from the generator.
        out_file: path handed to ``saver.save``.
        layers: number of hidden layers.
        nodes: iterable of per-layer node counts (each cast to int); must
            have ``layers`` entries.
        subgraphs: iterable of per-layer subgraph counts (each cast to int);
            must have ``layers`` entries.
        classes: number of output classes.
        iterations: number of passes of the outer training loop.
        batch_size: used to derive the number of inner-loop steps per pass.
        training_rate: learning rate handed to the Adam optimizer.

    Raises:
        AssertionError: if the list lengths do not match ``layers`` or the
            layer sizes are not divisible by the subgraph counts.
    """
    # cast every argument to its numeric type
    window_size = int(window_size)
    num_examples = int(num_examples)
    layers = int(layers)
    nodes = list(map(int, nodes))
    subgraphs = list(map(int, subgraphs))
    classes = int(classes)
    iterations = int(iterations)
    batch_size = int(batch_size)
    training_rate = float(training_rate)

    #make sure length of lists is correct
    assert (layers == len(nodes))
    assert (layers == len(subgraphs))

    #define nodes[0] as the data_size and subgraphs[0] as 1
    data_size = window_size * window_size
    print(data_size)
    nodes = [data_size] + nodes
    subgraphs = [1] + subgraphs

    #make sure that topology setup will work
    #check up to layers-1, the highest index
    for i in range(1, layers):
        assert (nodes[i - 1] % subgraphs[i] == 0)
        assert (nodes[i] % subgraphs[i] == 0)
    assert (classes % subgraphs[layers] == 0)

    #record time for training to run
    # (the measured span also covers graph construction and data loading)
    start_time = time.time()

    data_size = window_size * window_size

    # tf Graph input
    x = tf.placeholder("float", [None, (data_size)])  #inputs
    y_ = tf.placeholder("float", [None, classes])  #ground-truth labels

    #create variables to store weights and biases
    #create an h in weights and a b in biases for each layer in the model
    #h1 and b1 create variables that each correspond to one of the subgraphs
    # of layer 1. There should be (subgraphs[1]) different subvariables
    # created in each. Each subvariable should be named "h1_[#]" or "b1_[#]",
    # where "#" is the subvariable number
    #h2 and b2 are the same as h1 and b1 except that they apply to the second
    # subconnected layer, as are h3 and b3 for the third and so on
    #the out variables control the input into the fully-connected final layer
    # and are named "out_weights" and "out_biases"
    #NOTE: THE NAMES ARE NECESSARY TO SAVE THE MODEL TO A FILE

    #start by initializing weights and biases with the out variables
    weights = {
        'out':
        tf.Variable(tf.random_normal([int(nodes[layers]),
                                      int(classes)]),
                    name="out_weights")
    }
    biases = {
        'out': tf.Variable(tf.random_normal([int(classes)]), name="out_biases")
    }

    #add in the h and b variables for each hidden layer
    #note: you are creating subgraphs[i] subvariables in both weights and
    #biases; each weight subvariable has shape
    #(nodes[i-1]/subgraphs[i], nodes[i]/subgraphs[i]) and each bias
    #subvariable has length nodes[i]/subgraphs[i].
    #the s in range(0, subgraphs[i]) is creating multiple subvariables inside
    #of each weights[weights_name] or biases[biases_name]
    #for documentation on creating each of these subvariables, see
    # https://www.tensorflow.org/api_docs/python/tf/random_normal
    for i in range(1, layers + 1):
        weights_name = "h" + str(i)
        biases_name = "b" + str(i)
        weights[weights_name] = [
            tf.Variable(tf.random_normal([
                int((nodes[i - 1]) / subgraphs[i]),
                int(nodes[i] / subgraphs[i])
            ]),
                        name=(weights_name + "_" + str(s)))
            for s in range(0, subgraphs[i])
        ]
        biases[biases_name] = [
            tf.Variable(tf.random_normal([int((nodes[i]) / subgraphs[i])]),
                        name=(biases_name + "_" + str(s)))
            for s in range(0, subgraphs[i])
        ]

    #add variables to collection and initialize the saver
    #for each layer, add all of the subvariables
    # NOTE(review): what is added to the 'vars' collection here are the name
    # strings, not the tf.Variable objects themselves -- confirm intended.
    for i in range(1, layers + 1):
        weights_name = "h" + str(i) + "_"
        biases_name = "b" + str(i) + "_"
        for s in range(subgraphs[i]):
            subweight_name = weights_name + str(s)  #each should be "h(i)_(s)"
            subbias_name = biases_name + str(s)  #each should be "b(i)_(s)"
            tf.add_to_collection('vars', subweight_name)
            tf.add_to_collection('vars', subbias_name)
    #add the out variables
    tf.add_to_collection('vars', "out_weights")
    tf.add_to_collection('vars', "out_biases")

    #initialize saver
    saver = tf.train.Saver()

    # Construct model
    #y contains the predicted outputs which will be compared to the
    #ground-truth, y_
    y = multilayer_perceptron(x, layers, weights, biases, subgraphs)

    # Define loss and optimizer
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_))
    optimizer = tf.train.AdamOptimizer(
        learning_rate=training_rate).minimize(cost)

    # Initializing the variables
    init = tf.global_variables_initializer()

    #get the generator for features and labels
    generator = preprocessing.preprocess(input_file, label_file, window_size)
    features = []
    labels = []
    # Stop after num_examples items; a shorter generator simply yields fewer.
    for count, curr in enumerate(generator):
        if count >= num_examples:
            break
        curr_features = curr[0]
        curr_features = list(map(float, curr_features))
        curr_labels = curr[1]
        curr_labels = list(map(float, curr_labels))
        features.append(curr_features)
        labels.append(curr_labels)
    features = np.asarray(features)
    labels = np.asarray(labels)

    # Launch the graph
    with tf.Session() as sess:
        sess.run(init)

        # Training cycle
        for epoch in range(iterations):
            '''avg_cost = 0.''' #removed from example code to simplify
            total_batch = int(num_examples / batch_size)
            # Loop over all batches
            # NOTE(review): every iteration feeds the complete features and
            # labels arrays, not a batch_size slice -- confirm intended.
            for i in range(total_batch):
                # Run optimization op (backprop) and cost op (to get loss value)
                sess.run([optimizer, cost],
                         feed_dict={
                             x: features,
                             y_: labels
                         })
                #removed avg_cost tracking for simplicity
                '''# Compute average loss
                avg_cost += int(c / total_batch)''' #c was collected from sess.run
            #removed this section from the example code for simplicity
            '''# Display logs per epoch step
            if epoch % display_step == 0:
                print("Epoch:", '%04d' % (epoch+1), "cost=", \
                    "{:.9f}".format(avg_cost))'''
        print("Optimization Finished!")
        end_time = time.time()

        #print training accuracy and training time
        curr_loss = sess.run([cost], feed_dict={x: features, y_: labels})[0]
        # NOTE(review): cost is already a reduce_mean; it is divided by
        # num_examples again before printing.
        print("The training error was", (curr_loss / num_examples))
        print("Optimization took %s seconds" % (end_time - start_time))

        #output to out_file
        saver.save(sess, out_file)
centers = [ individual_features[i:(i + individual_length)] for i in range(0, len(individual_features), individual_length) ] distance_list = [] for a, b in itertools.combinations(centers, 2): d1 = distance.euclidean(a, b) distance_list.append(d1) Dc = max(distance_list) Ec = np.sum(individual.partition_matrix * individual.distance_matrix) PBM_index = math.pow((Dc / (individual.no_of_Cluster * Ec)), 2) return PBM_index individual_list = [] individual_length, individual_list = preprocess( 'Input_data/preprocessed_BCLL.txt') data_matrix_trans = preprocess_fcm_datamatrix(individual_length, individual_list) individual_no = len(individual_list) data_matrix = np.array(individual_list) print("##", data_matrix.shape) print(data_matrix_trans.shape) """ Input from user """ chromosome_number = int( sys.argv[1] ) # Enter the number of chromosome(individual) you want to generate generation_number = int(sys.argv[2]) #problem = Problem(num_of_variables=3, objectives=[f1, f2], variables_range=[(-5, 5)], same_range=True, expand=False)