def RateSimilarity(self, first_word, second_word):
    """Compute the Jaro similarity between two words."""
    # Check for None before lowercasing, otherwise this would crash.
    if first_word is None or second_word is None:
        return self.default_match_score
    first_word = first_word.lower()
    second_word = second_word.lower()
    if first_word == second_word:
        return self.default_match_score
    # Matching window: half the length of the shorter word, plus one.
    half_length = min(len(first_word), len(second_word)) // 2 + 1
    common1 = self.GetCommonCharacters(first_word, second_word, half_length)
    common_matches = len(common1)
    if common_matches == 0:
        return self.default_match_score
    common2 = self.GetCommonCharacters(second_word, first_word, half_length)
    if common_matches != len(common2):
        return self.default_match_score
    # Count transpositions between the two common-character sequences.
    transpositions = 0
    for i in range(common_matches):
        if common1[i] != common2[i]:
            transpositions += 1
    transpositions /= 2
    jaro_metric = (common_matches / (3.0 * len(first_word))
                   + common_matches / (3.0 * len(second_word))
                   + (common_matches - transpositions) / (3.0 * common_matches))
    return jaro_metric
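# For reference, a self-contained sketch of the same Jaro formula without the
# class plumbing. The helper names below (_common_characters, jaro) are
# illustrative, not part of the original code; the window rule mirrors the
# min-length-based one used above.
def _common_characters(s1, s2, window):
    # Characters of s1 that also appear in s2 within the matching window.
    s2_chars = list(s2)
    common = []
    for i, ch in enumerate(s1):
        lo, hi = max(0, i - window), min(len(s2), i + window + 1)
        for j in range(lo, hi):
            if s2_chars[j] == ch:
                common.append(ch)
                s2_chars[j] = None  # consume the match
                break
    return common

def jaro(s1, s2):
    window = min(len(s1), len(s2)) // 2 + 1
    c1 = _common_characters(s1, s2, window)
    c2 = _common_characters(s2, s1, window)
    m = len(c1)
    if m == 0 or m != len(c2):
        return 0.0
    t = sum(a != b for a, b in zip(c1, c2)) / 2
    return m / (3.0 * len(s1)) + m / (3.0 * len(s2)) + (m - t) / (3.0 * m)

print(jaro("martha", "marhta"))  # classic Jaro example, ~0.944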
def getUserInputTF(prompt):
    # FUNCTION PURPOSE: Get a valid boolean (True or False) from the user
    #
    # INPUTS:
    #    prompt: String that is printed to the console to prompt user input
    #
    # OUTPUTS:
    #    userInput: Boolean containing the user's answer to 'prompt'

    # Print the prompt to the console, followed by the input options ("Y" or "N")
    print(prompt + " (Y/N)")

    # userInput starts as an empty string
    userInput = ""

    # While userInput remains empty, get input
    while not userInput:
        userInput = input()

        # If input isn't either "Y" or "N", reset userInput to an empty string
        if userInput.lower() != "y" and userInput.lower() != "n":
            print("Please enter a valid answer (Y/N):")  # Let the user know the requirements
            userInput = ""

    # Now that the loop has finished, return True for "Y" and False for "N"
    return userInput.lower() == "y"
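# A minimal check of getUserInputTF, feeding canned answers by temporarily
# replacing builtins.input. This is a test-only trick introduced here, not
# part of the original code.
import builtins

_answers = iter(["maybe", "Y"])
_orig_input = builtins.input
builtins.input = lambda: next(_answers)
try:
    assert getUserInputTF("Save the plot?") is True  # re-prompts, then accepts "Y"
finally:
    builtins.input = _orig_input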
def ComputeClosestWordsSimilarity(self, s1, s2):
    max_similarity = 0
    s1_closest_words = self.GetClosesWords(s1)
    s2_closest_words = self.GetClosesWords(s2)
    for ss1 in s1_closest_words:
        for ss2 in s2_closest_words:
            syn_similarity = self.w_2_vec_util.GetWord2VecSimilarity(
                str(ss1).lower(), str(ss2).lower())
            if syn_similarity > max_similarity:
                max_similarity = syn_similarity
    return max_similarity
def getUserInputTF(prompt):
    print(prompt + " (Y/N)")  # Prompt the user for a yes-or-no answer

    userInput = ""
    while not userInput:
        userInput = input()
        if userInput.lower() != "y" and userInput.lower() != "n":
            print("Please enter a 'Y' or 'N'")
            userInput = ""

    return userInput.lower() == "y"
def getUserInputUnits():
    print("Enter the unit of time to use with this plot:")

    userInput = ""
    while not userInput:
        userInput = input()
        if userInput.lower() not in ("hours", "days", "months", "years"):
            print("Please enter either 'hours', 'days', 'months', or 'years':")
            userInput = ""

    return userInput.lower()
def load_source_rows(tab, names, key='assoc'):
    """Load the rows from a table that match a source name.

    Parameters
    ----------
    tab : `astropy.table.Table`
       Table that will be searched.

    names : list
       List of source identifiers.

    key : str
       Name of the table column that will be searched for a source
       matching key.

    Returns
    -------
    outtab : `astropy.table.Table`
       Table containing the subset of rows with matching source
       identifiers.

    """
    names = [name.lower().replace(' ', '') for name in names]
    col = tab[[key]].copy()
    col[key] = defchararray.replace(defchararray.lower(col[key]), ' ', '')
    mask = create_mask(col, {key: names})
    return tab[mask]
def getProblemSet(self, options):
    variance = options.sample_var
    sample_amount = options.sample_amount
    ttk = self.transition_kernel

    # index 1 and 2 are kernel indices, 3 is the sample index
    mu = _np.repeat(ttk[:, :, :, _np.newaxis], sample_amount, axis=3)

    # If we use variance scaling, make a kernel for each variance
    # between the lower and upper limit.
    if options.variance_scaling:
        variance = _np.divide(
            range(options.variance_lower, options.sample_amount),
            options.sample_amount / options.variance_upper)

    if options.sample_method.lower() == "uniform":
        # Sample from a uniform distribution.
        tk_low, tk_up = Interval.compute_interval(mu, variance)
        non_normalized_tks = _np.random.uniform(tk_low, tk_up)
    elif options.sample_method.lower() == "monte carlo":
        non_normalized_tks = monte_carlo_sampling(
            self.transition_kernel, sample_amount,
            options.monte_carlo_sampling_init_count_value,
            options.monte_carlo_sampling_random_samples)
    else:
        # Sample from a normal distribution.
        non_normalized_tks = _np.random.normal(mu, variance)

    problems_out = []
    for i in range(sample_amount):
        tk = self.normalize_tk(non_normalized_tks[:, :, :, i])
        for a in options.non_robust_actions:
            tk[a] = self.transition_kernel[a]

        distance = 0
        for a in range(self.transition_kernel.shape[0]):
            for s in range(self.transition_kernel.shape[1]):
                distance += wasserstein_distance(
                    tk[a][s], self.transition_kernel[a][s])

        new_problem = Problem(tk, self.reward_matrix, self.discount_factor,
                              self.name, distance)
        # new_problem.transition_kernel = tk
        problems_out.append(new_problem)

    return ProblemSet(problems_out, self, options, Sampling.ALL)
def find_i_nodes(g):
    nodes = dict(g.nodes)
    i_node_list = []
    for key in nodes:
        if 'i' == nodes[key]['type'].lower():
            i_node_list.append((key, nodes[key]))
    return i_node_list
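# A minimal usage sketch, assuming g is a networkx graph whose nodes carry a
# 'type' attribute; the node names and types below are made up.
import networkx as nx

g = nx.Graph()
g.add_node("n1", type="I")
g.add_node("n2", type="o")
print(find_i_nodes(g))  # [('n1', {'type': 'I'})]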
def dbscan(metric, eps, min_samples):
    dbscan = DBSCAN(metric=metric, eps=eps, min_samples=min_samples)
    configuration = [
        dbscan.__class__.__qualname__.lower(),
        str(dbscan.metric),
        str(dbscan.min_samples),
        str(dbscan.eps)
    ]
    return configuration, dbscan
def affinity_propagation(number_init, max_iterations, damping):
    ap = AffinityPropagation(max_iter=max_iterations,
                             convergence_iter=number_init,
                             damping=damping)
    configuration = [
        ap.__class__.__qualname__.lower(),
        str(ap.convergence_iter)
    ]
    return configuration, ap
def affichage():
    """Display the list of available filters."""
    print("List of available filters (all lowercase):")
    with os.scandir("filters") as entries:
        for entry in entries:
            # Strip the ".py" extension and skip the empty tail of the split.
            name = entry.name.split(".py")[0]
            if name:
                print(name.lower())
def kmeans(number_clusters, number_init, max_iterations):
    kmeans = KMeans(n_clusters=number_clusters, n_init=number_init,
                    max_iter=max_iterations)
    configuration = [
        kmeans.__class__.__qualname__.lower(),
        str(kmeans.n_clusters),
        str(kmeans.n_init)
    ]
    return configuration, kmeans
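# A minimal usage sketch for the factory functions above, assuming
# scikit-learn is installed and KMeans is imported where kmeans() is defined;
# the toy data below is made up.
import numpy as np
from sklearn.cluster import KMeans

config, model = kmeans(number_clusters=2, number_init=10, max_iterations=300)
X = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 4.9]])
labels = model.fit_predict(X)
print(config)  # ['kmeans', '2', '10']
print(labels)  # two clusters, e.g. [0 0 1 1]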
def agglomerative(number_clusters, affinity, linkage):
    ac = AgglomerativeClustering(n_clusters=number_clusters,
                                 affinity=affinity,
                                 linkage=linkage)
    configuration = [
        ac.__class__.__qualname__.lower(),
        str(ac.n_clusters),
        str(ac.affinity),
        str(ac.linkage)
    ]
    return configuration, ac
def gaussian_mixture(number_clusters, number_init, max_iterations):
    gm = GaussianMixture(n_components=number_clusters, random_state=0,
                         n_init=number_init, max_iter=max_iterations)
    configuration = [
        gm.__class__.__qualname__.lower(),
        str(gm.n_components),
        str(gm.n_init)
    ]
    return configuration, gm
def spectral_clustering(affinity, assign_labels, number_init, number_neighbors):
    sc = SpectralClustering(affinity=affinity, assign_labels=assign_labels,
                            n_init=number_init, n_neighbors=number_neighbors)
    configuration = [
        sc.__class__.__qualname__.lower(),
        str(sc.n_init),
        str(sc.affinity),
        str(sc.assign_labels),
        str(sc.n_neighbors)
    ]
    return configuration, sc
def movie_wordcloud(df):
    title_df = df.select("id", "title")

    # Clean text: keep letters and whitespace only, lowercase everything.
    df_clean = title_df.select(
        "id",
        lower(regexp_replace('title', "[^a-zA-Z\\s]", "")).alias('title'))

    # Tokenize text.
    tokenizer = Tokenizer(inputCol='title', outputCol='words_token')
    df_words_token = tokenizer.transform(df_clean).select('id', 'words_token')

    # Remove stop words.
    remover = StopWordsRemover(inputCol='words_token', outputCol='words_clean')
    df_words_no_stopw = remover.transform(df_words_token).select(
        'id', 'words_clean')
    # df_words_no_stopw.show(10)

    wordsDF = df_words_no_stopw.select(explode("words_clean").alias("words"))
    wordsDF = wordsDF.select(trim(wordsDF.words).alias("words"))
    # wordsDF.show()

    wordCountDF = wordsDF.groupBy("words").count().orderBy(
        desc("count")).limit(16)
    # wordCountDF.show()
    pandD = wordCountDF.toPandas()
    pandD.drop(0, inplace=True)  # drop the first row (empty token)

    sns.barplot(y='words', x='count', data=pandD)
    plt.title("Movie Title Analysis")
    plt.xlabel('Word Frequency')
    plt.ylabel('Words')
    # plt.show()

    wordCountDF = wordsDF.groupBy("words").count().orderBy(
        desc("count")).limit(101)
    pandD = wordCountDF.toPandas()
    pandD.drop(0, inplace=True)  # drop the first row

    wordcloudConvertDF = pandD.set_index('words').T.to_dict('records')
    wordcloud = WordCloud(width=800, height=500, random_state=21,
                          max_font_size=100, relative_scaling=0.5,
                          colormap='Dark2') \
        .generate_from_frequencies(dict(*wordcloudConvertDF))

    plt.figure(figsize=(14, 10))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.title("Word Cloud - Movie Titles")
    plt.axis('off')
    plt.show()
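# A minimal sketch of the title-cleaning step above on a toy DataFrame,
# assuming pyspark is installed; the sample title is made up and a local
# SparkSession is created just for the demo.
from pyspark.sql import SparkSession
from pyspark.sql.functions import lower, regexp_replace

spark = SparkSession.builder.master("local[1]").getOrCreate()
demo_df = spark.createDataFrame([(1, "The Matrix (1999)!")], ["id", "title"])
demo_df.select(
    "id",
    lower(regexp_replace("title", "[^a-zA-Z\\s]", "")).alias("title")
).show()  # -> "the matrix "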
def main():
    print(color.BLUE + 'Execution Date: ' + color.END + Receipt_date)
    while True:
        req_user = input(
            "Want to stay out of brain burns - try Chuck therapy? (y/n): "
        ).lower()
        if req_user == 'y':
            api_call()
        elif req_user == 'n':
            print("I hope you are normal now.. A last joke for you, enjoy it...")
            api_call()
            break
        else:
            print(color.RED + "Please use y or n" + color.END)
            continue
def process_message(body):
    if 'format' not in body or 'path' not in body or 'loadTo' not in body:
        print('missing one or more fields in body object')
        return False

    file_format = body['format'].lower()  # renamed from 'format' to avoid shadowing the builtin
    file = body['path']
    loadTo = body['loadTo']

    if os.path.exists(file):
        if file_format == 'csv':
            return load_csv_to_db(file, loadTo)
        elif file_format == 'json':
            return load_json_to_db(file, loadTo)
        else:
            print("error - received {0} format, invoice format must be csv or json".format(file_format))
            return False
    else:
        print("could not find given path")
        return False
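# A minimal usage sketch; the path and table name below are hypothetical, and
# load_csv_to_db / load_json_to_db must be defined for a successful load.
# With a nonexistent path this simply prints a message and returns False.
message = {'format': 'CSV', 'path': '/tmp/invoices.csv', 'loadTo': 'invoices'}
print(process_message(message))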
def find_rows_by_string(tab, names, colnames=['assoc']):
    """Find the rows in a table ``tab`` that match at least one of the
    strings in ``names``.  This method ignores whitespace and case
    when matching strings.

    Parameters
    ----------
    tab : `astropy.table.Table`
       Table that will be searched.

    names : list
       List of strings.

    colname : str
       Name of the table column that will be searched for matching strings.

    Returns
    -------
    mask : `~numpy.ndarray`
       Boolean mask for rows with matching strings.

    """
    mask = np.empty(len(tab), dtype=bool)
    mask.fill(False)
    names = [name.lower().replace(' ', '') for name in names]

    for colname in colnames:
        if colname not in tab.columns:
            continue
        col = tab[[colname]].copy()
        col[colname] = defchararray.replace(
            defchararray.lower(col[colname]).astype(str), ' ', '')
        for name in names:
            mask |= col[colname] == name
    return mask
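# A minimal usage sketch, assuming astropy and numpy are installed; the
# source names below are made up.
import numpy as np
from numpy import char as defchararray
from astropy.table import Table

tab = Table({'assoc': ['Crab Nebula', 'VELA X', 'geminga']})
mask = find_rows_by_string(tab, ['crabnebula', 'Vela X'])
print(tab[mask])  # rows for 'Crab Nebula' and 'VELA X'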
def create_xml(config_df, isd, wftype):
    wf_name = "sit_aml_" + sys_name_l + "_" + country_l + "_" + wftype
    isd_tbl_list = config_df["isd"].tolist()

    for cnt, tab in enumerate(list(chunks(isd_tbl_list, SIZE))):
        if cnt == 0:
            workflow_name = wf_name + "_wf"
            workflow_file = wf_name + "_wf.xml"
            workflow_name_adhoc = wf_name + "_batch_wf"
            workflow_file_adhoc = wf_name + "_batch_wf.xml"
        else:
            workflow_name = wf_name + "_wf_" + str(cnt)
            workflow_file = wf_name + "_wf_" + str(cnt) + ".xml"
            workflow_name_adhoc = wf_name + "_batch_wf_" + str(cnt)
            workflow_file_adhoc = wf_name + "_batch_wf_" + str(cnt) + ".xml"

        create_file(sys_name_l, country_l, 'wf', workflow_name)
        create_file(sys_name_l, country_l, 'adhoc', workflow_name_adhoc)

        f_source_count = open(output_folder + '/sourcecount.txt', "a+")
        f_incremental = open(output_folder + '/' + workflow_file, "w+")
        f_adhoc = open(output_folder + '/' + workflow_file_adhoc, "w+")
        f_incremental.write(first + '\n')
        f_adhoc.write(first + '\n')

        for table in tab:
            table = table.strip()
            if len(table) == 0:
                print("ISD table name cannot be blank")
                os.remove('./running.script')
                sys.exit(1)

            tbl = config_df[config_df["isd"] == table]
            print(tbl)
            DATABASE = str(tbl["database"].tolist()[0]).lower().strip()
            table_name = str(tbl["table_name"].tolist()[0]).lower().strip()
            INC_WHERE = str(tbl["inc_filtercondition"].tolist()[0]).lower().strip()
            HIS_WHERE = str(tbl["batch_filtercondition"].tolist()[0]).lower().strip()

            output_file_name = '_'.join([country_u, sys_name_u, table.upper()])
            create_file(sys_name_l, country_l, 'table', output_file_name.lower())
            f_source_count.write(
                "select '" + output_file_name.upper() +
                "',COUNT(1),'${businessday}' FROM ${aml_sri_open}." +
                table_name + " WHERE " + INC_WHERE + " UNION ALL \n ")
            print("Table Name : " + table)

            f_incremental.write(header + '\n')
            f_adhoc.write(header + '\n')

            col_list = get_ISD_Col(isd.sheet_by_name(table))
            print("No of columns: %i\n" % len(col_list))
            all_cols = ','.join(col_list)
            for col in col_list:
                f_incremental.write(cols.replace('COLNAME', col) + '\n')
                f_adhoc.write(cols.replace('COLNAME', col) + '\n')

            inc_path_value = (path.replace('user_name', USER_NAME)
                              .replace('output_file_name', output_file_name)
                              .replace('db_name', DATABASE)
                              .replace('table_name', table_name)
                              .replace('var_column_order', all_cols)
                              .replace('WHERE_CLAUSE', INC_WHERE))
            hist_path_value = (path.replace('user_name', USER_NAME)
                               .replace('output_file_name', output_file_name)
                               .replace('db_name', DATABASE)
                               .replace('table_name', table_name)
                               .replace('var_column_order', all_cols)
                               .replace('WHERE_CLAUSE', HIS_WHERE))
            f_incremental.write(inc_path_value + '\n')
            f_adhoc.write(hist_path_value + '\n')

        trailer_value_inc = (trailer.replace('dest_path', DEST_PATH)
                             .replace('wrk_name', workflow_name)
                             .replace('db_id', DB_ID)
                             .replace('storage', STORAGE))
        trailer_value_adhoc = (trailer.replace('dest_path', DEST_PATH)
                               .replace('wrk_name', workflow_name_adhoc)
                               .replace('db_id', DB_ID)
                               .replace('storage', STORAGE))
        f_incremental.write(trailer_value_inc)
        f_adhoc.write(trailer_value_adhoc)

        f_incremental.close()
        f_adhoc.close()
        f_source_count.close()
    # - The config file options are not allowed to contain them
    for skip_option in {'config', 'help'}:
        del args[skip_option]

    return merge_args(args, config_args)


if __name__ == '__main__':
    config = get_options()

    number_runs = int(config['number_runs'])
    verbose = bool(config['verbose'])
    classification_problem = bool(config['classification'])

    # Name of the dataset and percentage of the entire data volume to sample
    dataset_name = config['dataset'].lower()
    subsample_factor = float(config['subsample'])

    # Settings for the neural network
    model_name = config['model'].lower()
    num_neurons = int(config['neurons'])
    num_layers = int(config['layers'])
    num_neurons_list = [num_neurons for i in range(num_layers)]
    activation = config['activation'].lower()
    use_bias = bool(config['bias'])

    # Generic parameters for the optimizer
    optimizer_name = config['optimizer'].lower()
    epochs = int(config['epochs'])
    learning_rate = float(config['learning_rate'])
    threshold = float(config['threshold'])
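# Note: bool(config['verbose']) is True for any non-empty string, including
# "False". A small hedged helper for string-valued config options follows;
# parse_bool is a name introduced here, not part of the original script.
def parse_bool(value):
    # Accept common textual spellings of true; everything else is False.
    if isinstance(value, bool):
        return value
    return str(value).strip().lower() in ('1', 'true', 'yes', 'y', 'on')

print(parse_bool("False"))  # False, unlike bool("False")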
createdON = createdArr[0]
if len(nStartArr) > 0:
    startAt = nStartArr[0]
    startON = nStartArr[1]
if len(endArr) > 0:
    endAt = endArr[0]
    # print(row['nEnd'])
    # endOn = endArr[1]

# Removing all non-numeric characters from a string in Python:
# re.sub("[^0-9]", "", "sdkjh987978asd098as0980a98sd")
# print(re.sub("[^0-9]", "", row['SumInsured']))
# re.sub("[^0-9]", "", row['Premium'])

# Map the BinderNo prefix to a branch name; fall back to the raw prefix.
branch = str(row['BinderNo'])[0:4].lower()
prefix = str(row['BinderNo'])[0:5].lower()
branch_names = {
    "kin": "Kingston",
    "por": "Port Antonio",
    "fal": "Falmouth",
    "old": "Old Harbour",
    "och": "Ocho Rios",
    "con": "Constant Spring Road",
    "spa": "Spanish Town",
}
for code, name in branch_names.items():
    if code in prefix:
        branch = name
def GetClosesWords(self, s1):
    closest_words = self.w_2_vec_util.GetClosestWords(s1.lower())
    return closest_words
def main():
    # Game starts here.
    players = []
    # max_nb_cards = 2  # also the limit to regular move testing
    max_nb_cards = 12  # also the limit to regular move
    game = Game(False, max_nb_cards, players)
    objective = ''

    # Auto-gen for testing:
    # game.__players.append(Player('kenlo', 1, game.__nb_cards))
    # game.__players[0].setObjective('colors')
    # game.__players.append(Player('bot', 2, game.__nb_cards))
    # game.__players[1].setObjective('dots')

    alpha_beta = None
    while True:
        usr_input = input('Activate Alpha-Beta ? (Y/N) ')
        reply = usr_input.lower()
        if reply == 'y':
            alpha_beta = True
            break
        if reply == 'n':
            alpha_beta = False
            break

    trace = None
    while True:
        usr_input = input('Generate an Output Trace ? (Y/N) ')
        reply = usr_input.lower()
        if reply == 'y':
            trace = True
            break
        if reply == 'n':
            trace = False
            break

    if trace:
        trace_file = game.createOutputFile(alpha_beta)
        trace_file.close()
    else:
        trace_file = None

    ai_turn = 0
    while True:
        reply = input('The A.I will be Player 1 or 2 ? (1/2) ')
        if reply == '1' or reply == '2':
            ai_turn = int(reply)
            break

    for i in range(1, 3):
        if ai_turn == i:
            # bot = AIPlayer(2, 'Bot', i, max_nb_cards, 'ai')  # 2 levels deep
            bot = AIPlayer(1, 'Bot', i, max_nb_cards, 'ai')  # 1 level deep
            players.append(bot)
        else:
            name = str(input('Player ' + str(i) + ' name: '))
            player = Player(name, i, max_nb_cards, 'human')
            players.append(player)

        if i == 1:
            # if ai_turn == i:
            #     random_int = randint(0, 20)
            #     if random_int % 2 == 0:
            #         players[i - 1].setObjective('colors')
            #         print('Bot chose colors as objective.')
            #     else:
            #         players[i - 1].setObjective('dots')
            #         print('Bot chose dots as objective.')
            # else:
            while True:
                usr_input = input('Player ' + str(i) + ' objective (colors or dots): ')
                objective = usr_input.lower()
                if objective == 'colors' or objective == 'dots':
                    players[i - 1].setObjective(objective)
                    break
        else:
            if players[i - 2].objective() == 'colors':  # player 1 at position 0 in list
                players[i - 1].setObjective('dots')     # player 2 at position 1 in list
            elif players[i - 2].objective() == 'dots':
                players[i - 1].setObjective('colors')

    # New blank board.
    brd_1 = Board(8, 12)
    brd_1.setBoard()
    brd_1.printBoard()

    # Cards history.
    played_cards = []

    turn_count = 0
    # end_game = 30
    end_game = 20

    # Regular turns loop. Note: the original read game.__hasWinner from
    # outside the class, which name mangling forbids; a public hasWinner
    # attribute on Game is assumed here.
    while not game.hasWinner:
        turn_count += 1
        print('Round #', turn_count, sep='')
        print('=' * 36)
        print('')

        turnP1 = Turn(alpha_beta, trace, turn_count, max_nb_cards, end_game,
                      brd_1, players[0], played_cards, trace_file)
        turnResult = turnP1.start()
        game.checkResult(turnResult)
        if game.hasWinner:
            break

        turnP2 = Turn(alpha_beta, trace, turn_count, max_nb_cards, end_game,
                      brd_1, players[1], played_cards, trace_file)
        turnResult = turnP2.start()
        game.checkResult(turnResult)
        if game.hasWinner:
            break

        if turn_count == end_game:
            print('Game Over. It ended in a draw.')
            break

        print('End of Round #', turn_count, sep='')
        print('=' * 36)
        print('')
    points.append(left)
    points.append(right)


# Checks whether the program has been closed out of.
def isQuit():
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            pygame.quit()
            sys.exit()


# Ask the user whether to draw each segment of the snowflake slowly or draw
# whole iterations at once.
print("\n")
st = input("slow draw? (Y/N): ")
slowDraw = (st.lower() == "y")

# Main loop
while True:
    isQuit()
    num = 0
    resetPoints()
    while num < 6:
        isQuit()
        display_surf.fill(BLACK)
        if slowDraw:
            for i in range(len(points)):
                isQuit()
                if i == len(points) - 1:
                    next = points[0]
                else:
def sentiment_model():
    print("Predicting Sentiments")
    imdb_df_pd = session.execute('SELECT * FROM movie_sent')
    # imdb_df_pd = pd.read_csv("IMDB_Dataset.csv")
    for col in imdb_df_pd.columns:
        if imdb_df_pd[col].dtypes == 'object':
            imdb_df_pd[col] = imdb_df_pd[col].astype('str')

    imdb_df = sqlContext.createDataFrame(imdb_df_pd)
    # Print the schema in a tree format:
    # imdb_df.printSchema()

    # Categorize sentiment to 0 or 1.
    indexer = StringIndexer(inputCol="sentiment", outputCol="score")
    imdb_df = indexer.fit(imdb_df).transform(imdb_df)
    imdb_df = imdb_df.drop("sentiment")

    # Strip punctuation and stray quote/apostrophe artifacts.
    imdb_df = imdb_df.select(
        regexp_replace('review', '[!?.;:#-/<>]+', ' ').alias('review'), 'score')
    imdb_df = imdb_df.select(
        regexp_replace('review', '\"', ' ').alias('review'), 'score')
    imdb_df = imdb_df.select(
        regexp_replace('review', ',', ' ').alias('review'), 'score')
    imdb_df = imdb_df.select(
        regexp_replace('review', r"('s\s+)", ' ').alias('review'), 'score')
    imdb_df = imdb_df.select(
        regexp_replace('review', r"('\s+)", ' ').alias('review'), 'score')

    # Remove all single characters.
    imdb_df = imdb_df.select(
        regexp_replace('review', r'\s+[a-zA-Z]\s+', ' ').alias('review'), 'score')
    # Remove single characters from the start.
    imdb_df = imdb_df.select(
        regexp_replace('review', r'^[a-zA-Z]\s+', '').alias('review'), 'score')
    # Remove digits.
    imdb_df = imdb_df.select(
        regexp_replace('review', '[0-9]+', ' ').alias('review'), 'score')
    # Substitute multiple spaces with a single space.
    imdb_df = imdb_df.select(
        regexp_replace('review', r'\s+', ' ').alias('review'), 'score')
    # Convert to lowercase.
    imdb_df = imdb_df.select(lower(imdb_df.review).alias('review'), 'score')

    # Tokenize text.
    tokenizer = Tokenizer(inputCol='review', outputCol='words_token')
    df = tokenizer.transform(imdb_df)

    # Remove stop words.
    remover = StopWordsRemover(inputCol='words_token',
                               outputCol='words_clean',
                               caseSensitive=False)
    # df = remover.transform(df)
    # df.show(10)

    cv = CountVectorizer(inputCol="words_clean", outputCol="tf",
                         vocabSize=2**17, minDF=5.0)

    # We now create a pipelined transformer.
    cv_pipeline = Pipeline(stages=[tokenizer, remover, cv]).fit(imdb_df)

    idf = IDF(inputCol="tf", outputCol="idf")
    idf_pipeline = Pipeline(stages=[cv_pipeline, idf]).fit(imdb_df)

    training_df, validation_df, testing_df = imdb_df.randomSplit(
        [0.6, 0.3, 0.1], seed=0)

    lr = LogisticRegression(maxIter=50, regParam=0.0, elasticNetParam=0.0,
                            featuresCol="idf", labelCol="score")
    lr_pipeline = Pipeline(stages=[idf_pipeline, lr]).fit(training_df)

    print("Prediction Accuracy before Tuning")
    lr_pipeline.transform(validation_df). \
        select(expr('float(prediction = score)').alias('correct')). \
        select(avg('correct').alias('Accuracy')).show()

    # Identify noise in the model.
    vocabulary = cv_pipeline.stages[2].vocabulary
    # vocabulary = idf_pipeline.stages[0].stages[2].vocabulary
    weights = lr_pipeline.stages[1].coefficients.toArray()
    coeffs_df = pd.DataFrame({'word': vocabulary, 'weight': weights})
    # print(coeffs_df.sort_values('weight').head(5))
    # print(coeffs_df.sort_values('weight', ascending=False).head(5))

    # The model above overfits: modify the loss function and penalize weight
    # values that are too large, using either Lasso (L1) or Ridge (L2).
    from pyspark.ml.tuning import ParamGridBuilder
    # Evaluate the model and find the best fit (grid search done separately).
    # Best parameters are regParam = 0.01 and elasticNetParam = 0.2.
    lr = LogisticRegression(maxIter=50, regParam=0.01, elasticNetParam=0.2,
                            featuresCol="idf", labelCol="score")
    lr_pipeline_fitted = Pipeline(stages=[idf_pipeline, lr]).fit(training_df)

    print("Prediction Accuracy - After Tuning")
    lr_pipeline_fitted.transform(validation_df). \
        select(expr('float(prediction = score)').alias('correct')). \
        select(avg('correct').alias('accuracy')).show()

    # Identify noise in the model.
    vocabulary = cv_pipeline.stages[2].vocabulary
    weights = lr_pipeline_fitted.stages[1].coefficients.toArray()
    coeffs_df = pd.DataFrame({'word': vocabulary, 'weight': weights})
    # print(coeffs_df.sort_values('weight').head(5))
    # print(coeffs_df.sort_values('weight', ascending=False).head(5))

    print("end of sentiment model")
    return lr_pipeline_fitted
def CleanString(string):
    new_string = ''.join(e for e in string if e.isalnum())
    return new_string.lower()
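# Quick check of CleanString (the input string is illustrative):
print(CleanString("Hello, World! 123"))  # helloworld123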
def where_is(self, patterns, df=None, union=True, columns=None, exact=False,
             case_sensitive=False):
    """Find a list of string patterns in a DataFrame.

    Parameters
    ----------
    patterns : list
        List of string patterns to search.
    df : pd.DataFrame | None
        The DataFrame to use. If None, the ROI's own DataFrame is used
        by default.
    union : bool | True
        Take either the union of matching patterns (True) or the
        intersection (False).
    columns : list | None
        List of specific column names to search in. If None, this method
        searches through the entire DataFrame.
    exact : bool | False
        Specify whether the pattern has to match exactly (True) or whether
        the pattern may be only a part of the result (False).
    case_sensitive : bool | False
        Specify whether the search has to be case sensitive.

    Returns
    -------
    idx : list
        List of indices that match the list of patterns.
    """
    # Check inputs:
    assert isinstance(patterns, (str, list, tuple))
    df_to_use = self.ref if df is None else df
    is_pandas_installed(raise_error=True)
    import pandas as pd
    assert isinstance(df_to_use, pd.DataFrame)
    patterns = [patterns] if isinstance(patterns, str) else patterns
    patterns = list(patterns)
    if columns is not None:
        df_to_use = df_to_use[columns]
    dfarr = np.array(df_to_use).astype(str)

    # Case sensitivity:
    if not case_sensitive:
        dfarr = npchar.lower(dfarr)
        patterns = npchar.lower(np.array(patterns).astype(str))

    # Define the matching function:
    if exact:
        def match(x, pat):
            return np.any(x == pat, axis=1)  # noqa
    else:
        def match(x, pat):
            return np.any((npchar.find(x, pat) + 1).astype(bool), axis=1)

    # Locate patterns:
    idx_to_keep = np.zeros((dfarr.shape[0], len(patterns)), dtype=bool)
    for k, p in enumerate(patterns):
        idx_to_keep[:, k] = match(dfarr, str(p))

    # Return either the union or intersection across the search:
    fcn = np.any if union else np.all
    idx_to_keep = fcn(idx_to_keep, 1)

    if not np.any(idx_to_keep):
        logger.error("No corresponding entries in the %s ROI for "
                     "%s" % (self.name, ', '.join(patterns)))
        return []
    else:
        idx_roi = np.array(df_to_use['index'].loc[idx_to_keep]).astype(int)
        return idx_roi.tolist()
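# A standalone sketch of the matching trick used above: numpy.char.find
# returns -1 where the pattern is absent, so adding 1 and casting to bool
# flags cells that contain the pattern. The toy array below is made up.
import numpy as np
from numpy import char as npchar

arr = np.array([['Insula', 'Precentral'], ['Cuneus', 'insula']]).astype(str)
pat = 'insula'
hits = (npchar.find(npchar.lower(arr), pat) + 1).astype(bool)
print(np.any(hits, axis=1))  # [ True  True ]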
def compareMethods(waveR, waveC, parametersR, parametersC, regionR, regionC):
    # FUNCTION PURPOSE: Get user input to compare results from two methods based on their hodographs
    #
    # INPUTS:
    #    wave: Dictionary containing wavelet transformed surfaces, for rectangle (R) and contour (C) methods
    #    parameters: Dictionary containing wave parameters, for R and C methods
    #    region: Boolean mask tracing the wave on the power surface, for R and C methods
    #
    # OUTPUTS:
    #    parameters: Dictionary containing wave parameters, for the chosen method
    #    region: Boolean mask tracing the wave on the power surface, for the chosen method

    # First, filter based on half-max wind variance, from Murphy (2014).
    # Calculate the wind variance of the wave.
    windVarianceR = np.abs(waveR.get('uTrim')) ** 2 + np.abs(waveR.get('vTrim')) ** 2
    windVarianceC = np.abs(waveC.get('uTrim')) ** 2 + np.abs(waveC.get('vTrim')) ** 2

    # Get rid of values below half-power, per Murphy (2014).
    uR = waveR.get('uTrim').copy()[windVarianceR >= 0.5 * np.max(windVarianceR)]
    vR = waveR.get('vTrim').copy()[windVarianceR >= 0.5 * np.max(windVarianceR)]
    # Note: the original read the contour-method data from waveR, which looks
    # like a copy-paste slip; it lives in waveC, matching windVarianceC.
    uC = waveC.get('uTrim').copy()[windVarianceC >= 0.5 * np.max(windVarianceC)]
    vC = waveC.get('vTrim').copy()[windVarianceC >= 0.5 * np.max(windVarianceC)]

    # Discard imaginary components, which aren't needed for the hodograph.
    uR = uR.real
    vR = vR.real
    uC = uC.real
    vC = vC.real

    # Now, create hodograph subplots for easy comparison.
    fig, ax = plt.subplots(1, 2)
    fig.suptitle('Which Hodograph Looks Better?')
    ax[0].plot(uR, vR)
    ax[0].set_title('Rectangle Peak Trace Method')
    ax[1].plot(uC, vC)
    ax[1].set_title('Contour Peak Trace Method')
    plt.show()

    # Get user input for selection.
    print("\r\nPlease enter the name of the method that showed a more elliptical shape:")

    # userInput starts as an empty string.
    userInput = ""
    # While userInput remains empty, get input.
    while not userInput:
        userInput = input().lower()

        # If input isn't a recognized answer, reset userInput to an empty string.
        if userInput not in ("rectangle", "contour", "r", "c"):
            # Console output to let the user know the requirements.
            print("Please enter either 'rectangle' or 'contour':")
            userInput = ""

    # Now that the loop has finished, return the chosen parameters and region.
    if userInput in ("rectangle", "r"):
        return parametersR, regionR
    else:
        return parametersC, regionC
def evaluate(base_data_path, movement_data_path, settings, real_change,
             CALCULATE_ERROR):
    list_index = []
    list_R = []
    list_t = []
    list_inliers = []
    movement_type = base_data_path.split("/")[3].lower()
    print(base_data_path)
    print(movement_data_path)

    for image_i in range(1, 100):
        print(image_i)
        #
        # Predict pose change.
        #
        try:
            _, final_r, final_t, inliers, _ = predict_pose_change(
                base_data_path.format(image_i),
                movement_data_path.format(image_i),
                settings, real_change,
                CALCULATE_ERROR=False, print_image=True)
            list_index.append(image_i)
            list_R.append(final_r)
            list_t.append(final_t)
            list_inliers.append(inliers)
        except Exception as e:
            print("some exception occurred. skipping.")

    R_bar = np.array(list_R)
    t_bar = np.array(list_t)
    XYZ_mean = t_bar.mean(axis=0)
    XYZ_std = t_bar.std(axis=0)
    RPY_mean = R_bar.mean(axis=0)
    RPY_std = R_bar.std(axis=0)

    #
    # Plot CDFs for translation and rotation.
    #
    plt.figure(figsize=[7, 7])
    plt.subplot(2, 1, 1)
    plot_cdf(t_bar, "Translation", movement_type, real_change, xyz=True)
    plt.subplot(2, 1, 2)
    plot_cdf(R_bar, "Rotation", movement_type, real_change, rpy=True)
    plt.show()

    #
    # Plot all predictions per rotation and translation.
    #
    plt.figure(figsize=(12, 8))
    plt.subplot(1, 2, 1)
    plot_means(R_bar, RPY_mean, RPY_std, "Rotation")
    plt.xlabel("image-pair index")
    plt.ylabel("prediction")
    plt.subplot(1, 2, 2)
    plot_means(t_bar, XYZ_mean, XYZ_std, "Translation")
    plt.ylabel("prediction")
    plt.xlabel("image-pair index")
    plt.suptitle("{}\n{}".format(base_data_path, movement_data_path))
    plt.show()

    plt.plot(t_bar)
    for i in range(len(list_index)):
        plt.text(i, t_bar[i][0], "#" + str(list_index[i]))
    plt.legend(["X", "Y", "Z"])
    plt.ylabel("prediction")
    plt.show()

    plt.plot(R_bar)
    for i in range(len(list_index)):
        plt.text(i, R_bar[i][0], "#" + str(list_index[i]))
    plt.legend(["φ", "θ", "ψ"])
    plt.ylabel("prediction")
    plt.show()

    #
    # Plot the number of inliers used.
    #
    X = [sum(x) for x in list_inliers]
    X_indices = np.argsort(X)
    X = np.array(X)[X_indices]
    M = np.array(list_index)[X_indices]
    plt.plot(range(len(M)), X, 'ro')
    plt.plot(range(len(M)), X, 'k.')
    for i in range(len(M)):
        plt.text(i, X[i] + 0.25, str(M[i]))
    plt.xlabel("image-pair index")
    plt.ylabel("Number of inliers")
    plt.title("number of inliers (sorted)")
    plt.show()
    print("X_indices", X_indices)
    print("list_index", M)

    return [
        XYZ_mean[0], XYZ_std[0],  # X
        XYZ_mean[1], XYZ_std[1],  # Y
        XYZ_mean[2], XYZ_std[2],  # Z
        RPY_mean[0], RPY_std[0],  # φ
        RPY_mean[1], RPY_std[1],  # θ
        RPY_mean[2], RPY_std[2],  # ψ
    ]
def run():
    # Create an SmmryAPI client with the SMMRY API key.
    smmry = SmmryAPI(SMMRY_API_KEY)

    # Text wrapper for console output.
    wrapper = TextWrapper(width=200, initial_indent=" ", subsequent_indent=" ")

    while True:
        os.system('clear')
        printBanner()
        query = input(BOLD + 'Enter search query: ' + RESET)
        connection = urllib.request.urlopen(
            'http://localhost:8983/solr/csce470/select?q=' +
            urllib.parse.quote_plus(query) + '&rows=100000')
        response = json.load(connection)
        numFound = response['response']['numFound']

        # Print query data.
        print(' + query time: ' + YELLOW +
              str(response['responseHeader']['QTime']) + ' ms' + RESET)
        print(' + documents found: ' + YELLOW + str(numFound) + RESET)

        index = 0
        while True:
            # Show ten search results per page.
            end = 10
            if (index + 10) >= numFound:
                end = 10 - ((index + 10) - numFound)
            print(' + search results (showing ' + str(index + 1) + '-' +
                  str(index + end) + ' out of ' + str(numFound) + '):')
            for x in range(index, index + end):
                try:
                    print('{0: <223}'.format(
                        BG_WHITE + BLACK + ' [' + str(x + 1) + ']: ' + RESET +
                        BG_WHITE + BLACK +
                        str(response['response']['docs'][x]['title'])) + RESET)
                    if DONT_CALL_API == False:
                        article = smmry.summarize(
                            str(response['response']['docs'][x]['url']),
                            sm_length=SUMMARY_LEN)
                        print(wrapper.fill('Summary: ' + GREEN +
                                           str(article.sm_api_content)) + RESET)
                    else:
                        print('Summary: [DONT_CALL_API is enabled]')
                except SmmryAPIException as e:
                    # print(' error ' + str(e))
                    print(wrapper.fill(RED + '[Could not summarize]' + RESET))
                except Exception as e:
                    print(wrapper.fill(RED + '[Could not summarize]' + RESET))
                finally:
                    print(wrapper.fill(
                        'URL: ' + UNDERLINE + BLUE +
                        str(response['response']['docs'][x]['url']) + RESET))

            # Next-steps loop.
            answer = ''
            while True:
                answer = input(BOLD + 'View more? (prev/next/new): ' + RESET)
                if answer.lower() == 'prev':
                    if (index - 10) < 0:
                        print('Can\'t go back anymore. Try again.')
                    else:
                        index = index - 10
                        break
                elif answer.lower() == 'next':
                    if (index + 10) >= numFound:
                        print('Can\'t go forward anymore. Try again.')
                    else:
                        index = index + 10
                        break
                elif answer.lower() == 'new':
                    break
                else:
                    print('Invalid command. Try again.')

            # Start a new query.
            if answer.lower() == 'new':
                break
            else:
                os.system('clear')
                printBanner()
                print(BOLD + 'Enter search query: ' + RESET + query)
                print()