def __init__(self):
    """Load the feature table and register the named feature groups.

    Reads ``FinalFeatures.xlsx`` into ``self.featuresDF``, takes its
    ``IsEfficient`` column as ``self.labels`` and fills ``self.group_dic``,
    which maps a group number to ``[list of column names, group name]``.
    """
    def now():
        # Human-readable local timestamp used as the prefix of progress lines.
        return time.asctime(time.localtime(time.time()))

    self.X_train = None
    self.feature_names = None

    print('{}: Loading the data '.format(now()))
    self.featuresDF = pd.read_excel('FinalFeatures.xlsx')
    self.labels = self.featuresDF['IsEfficient']

    self.submission_author_features = [
        'submission_author_number_original_subreddit',
        'submission_author_number_recommend_subreddit',
        'submission_created_time_hour',
    ]
    self.sub_comment_author_relation_features = [
        'cosine_similarity_subreddits_list',
        'comment_submission_similarity',
        'comment_title_similarity',
    ]
    self.comment_author_features = [
        'comment_author_number_original_subreddit',
        'comment_author_number_recommend_subreddit',
        'percent_efficient_references_comment_author',
        'number_of_references_comment_author',
    ]
    self.comment_features = [
        'comment_created_time_hour',
        'submission_created_time_hour',
        'time_between_messages',
        'comment_len',
        'number_of_r',
        'number_of_references_to_submission',
    ]
    self.subreddit_features = [
        'number_of_references_to_recommended_subreddit',
        'subreddits_similarity',
    ]

    # Group number -> [columns of the group, display name of the group].
    named_groups = (
        (self.submission_author_features, 'submission_author_features'),
        (self.sub_comment_author_relation_features,
         'sub_comment_author_relation_features'),
        (self.comment_author_features, 'comment_author_features'),
        (self.comment_features, 'comment_features'),
        (self.subreddit_features, 'subreddit_features'),
    )
    self.group_dic = {
        number: [columns, name]
        for number, (columns, name) in enumerate(named_groups)
    }

    print('{}: Data loaded '.format(now()))
def create_data_no_feature_selection(self):
    """Train the models on every registered feature group at once.

    All groups in ``self.group_dict`` are used (no feature selection). The
    flattened column list is stored in ``self.features``; ``self.X_train``
    additionally carries the ``group_number`` column.

    :return: DataFrame with one row per result of ``self.models_iteration()``,
        extended with the list of group names and ``opts.k_fold``.
    """
    def now():
        return time.asctime(time.localtime(time.time()))

    group_ids = list(self.group_dict.keys())

    # Flatten the per-group column lists into one list of feature columns.
    self.features = [
        column
        for group_id in group_ids
        for column in self.group_dict[group_id][0]
    ]
    self.X_train = self.featuresDF[self.features + ['group_number']]

    features_names = [self.group_dict[group_id][1] for group_id in group_ids]
    start_template = '{}: Start training with the groups: {}'
    print(start_template.format(now(), features_names))
    logging.info(start_template.format(now(), features_names))

    group_results = self.models_iteration()
    for model in group_results:
        model.extend([features_names, opts.k_fold])

    return pd.DataFrame(
        group_results,
        columns=['classifier_name', 'score', 'auc', 'train_time',
                 'features_list', 'k_fold'])
def __init__(self):
    """Load the raw feature table and declare the six feature groups.

    Reads ``FinalFeatures_with_comment_time.xlsx`` into
    ``self.original_data``. ``self.featuresDF`` and ``self.labels`` are left
    as ``None`` here. ``self.group_dic`` maps a group number to
    ``[list of column names, group name]``.
    """
    def now():
        return time.asctime(time.localtime(time.time()))

    self.X_train = None
    self.features = None
    self.feature_names = None

    print('{}: Loading the data: FinalFeatures_with_comment_time'.format(
        now()))
    self.original_data = pd.read_excel(
        'FinalFeatures_with_comment_time.xlsx')
    self.labels = None
    self.featuresDF = None

    self.submission_author_features = [
        'submission_author_number_original_subreddit',
        'submission_author_number_recommend_subreddit',
        'submission_created_time_hour',
    ]
    self.sub_comment_author_relation_features = [
        'cosine_similarity_subreddits_list',
        'comment_submission_similarity',
        'comment_title_similarity',
    ]
    # 'percent_efficient_references_comment_author' is not part of this group.
    self.comment_author_features = [
        'comment_author_number_original_subreddit',
        'comment_author_number_recommend_subreddit',
        'number_of_references_comment_author',
    ]
    self.comment_features = [
        'comment_created_time_hour',
        'time_between_messages',
        'comment_len',
        'number_of_r',
        'number_of_references_to_submission',
    ]
    self.subreddit_features = [
        'number_of_references_to_recommended_subreddit',
        'subreddits_similarity',
    ]
    # Text-embedding columns are named 0..99 (Word2Vec and 100-dim Doc2Vec);
    # for 50-dim Doc2Vec this would be range(50).
    self.text_features = range(100)

    named_groups = (
        (self.submission_author_features, 'submission_author_features'),
        (self.sub_comment_author_relation_features,
         'sub_comment_author_relation_features'),
        (self.comment_author_features, 'comment_author_features'),
        (self.comment_features, 'comment_features'),
        (self.subreddit_features, 'subreddit_features'),
        (self.text_features, 'text_features'),
    )
    self.group_dic = {
        number: [columns, name]
        for number, (columns, name) in enumerate(named_groups)
    }

    print('{}: Data loaded '.format(now()))
def split_relevant_data(self, Peff_up_threshold, Peff_down_threshold):
    """Keep the rows inside a Peff band and assign each one a fold number.

    Rows of ``self.original_data`` whose
    ``percent_efficient_references_comment_author`` lies in
    ``[Peff_down_threshold, Peff_up_threshold]`` are copied into
    ``self.featuresDF``. Each row then gets a ``group_number``: a group is
    closed once it holds ``rows / opts.k_fold`` samples, but only when the
    ``comment_author`` changes, so consecutive rows of one author stay in the
    same group.
    NOTE(review): this presumably relies on the rows being ordered by
    comment_author -- confirm against the data preparation step.

    Side effects: sets ``opts.k_fold`` to 4 before splitting when the upper
    threshold is 50.0, 60.0 or 100.0, and to the number of groups actually
    created afterwards; sets ``self.labels`` to the ``IsEfficient`` and
    ``group_number`` columns.

    :param Peff_up_threshold: inclusive upper bound of the Peff band
    :param Peff_down_threshold: inclusive lower bound of the Peff band
    """
    def stamp():
        return time.asctime(time.localtime(time.time()))

    def announce(template, *args):
        # Emit the same progress line on stdout and in the log.
        message = template.format(stamp(), *args)
        print(message)
        logging.info(message)

    peff = self.original_data['percent_efficient_references_comment_author']
    in_band = (peff <= Peff_up_threshold) & (peff >= Peff_down_threshold)
    # Copy the selection: the fold column is written below, and writing into
    # a slice of original_data raises SettingWithCopy warnings and is not
    # guaranteed to land in the frame we keep.
    self.featuresDF = self.original_data.loc[in_band].copy()

    if Peff_up_threshold in (50.0, 60.0, 100.0):
        opts.k_fold = 4
    sample_per_group = self.featuresDF.shape[0] / opts.k_fold

    i = 0
    number_sample_group = 0
    last_comment_author = ''
    for index, row in self.featuresDF.iterrows():
        comment_author = row['comment_author']
        group_is_full = number_sample_group >= sample_per_group
        starts_new_group = (group_is_full
                            and comment_author != last_comment_author)

        if starts_new_group:
            announce(
                '{}: finish split samples for group number {} with {} samples',
                i, number_sample_group)
            i += 1
            announce('{}: start split samples for group number {}', i)
            number_sample_group = 0

        # DataFrame.set_value was removed in pandas 1.0; .at is its
        # documented replacement for scalar label-based assignment.
        self.featuresDF.at[index, 'group_number'] = i
        number_sample_group += 1
        last_comment_author = comment_author

        if group_is_full and not starts_new_group:
            # Same author as the previous row: the group grows past its quota.
            print('{}: {} group is larger, number of samples is: {}'.format(
                stamp(), i, number_sample_group))

    announce('{}: finish split samples for group number {} with {} samples',
             i, number_sample_group)

    # Groups are numbered from 0, so the group count is the last number + 1.
    opts.k_fold = i + 1
    self.labels = self.featuresDF[['IsEfficient', 'group_number']]
    announce('{}: Finish split the data for Peff between: {} and {}',
             Peff_down_threshold, Peff_up_threshold)
def iterate_over_features_groups(self, peff_up_threshold,
                                 peff_down_threshold):
    """Train the models on every combination of feature groups containing group 5.

    For each combination of the groups 0..5 that includes group 5, the
    columns of the chosen groups are stored in ``self.features``,
    ``self.X_train`` is set to those columns plus ``group_number``, and
    ``self.ModelsIteration()`` is run. After every combination the
    accumulated results are written to ``test_results.csv``.

    :param peff_up_threshold: recorded in the ``Peff_up_threshold`` column
    :param peff_down_threshold: recorded in the ``Peff_down_threshold`` column
    :return: DataFrame with one row per model result over all combinations
    """
    def announce(template, names):
        message = template.format(
            time.asctime(time.localtime(time.time())), names)
        print(message)
        logging.info(message)

    columns_names = [
        'classifier_name', 'score', 'auc', 'train_time', 'group_list',
        'k_fold', 'Peff_up_threshold', 'Peff_down_threshold'
    ]
    result_frames = []
    all_groups_results = pd.DataFrame()

    for number_of_groups in range(1, 7):
        for groups in combinations(range(0, 6), number_of_groups):
            if 5 not in groups:
                continue

            self.features = [
                column
                for group in groups
                for column in self.group_dic[group][0]
            ]
            self.X_train = self.featuresDF[self.features + ['group_number']]
            group_names = [self.group_dic[group][1] for group in groups]

            announce('{}: Start training with the groups: {} ', group_names)
            group_results = self.ModelsIteration()
            announce('{}: Finish training with the groups: {}', group_names)

            for model in group_results:
                model.append(group_names)
                model.append(opts.k_fold)
                model.append(peff_up_threshold)
                model.append(peff_down_threshold)

            # DataFrame.append was removed in pandas 2.0; gather the
            # per-combination frames and concatenate them instead.
            result_frames.append(
                pd.DataFrame(group_results, columns=columns_names))
            all_groups_results = pd.concat(result_frames, ignore_index=True)
            # Checkpoint so partial results survive an interrupted run.
            all_groups_results.to_csv('test_results.csv', encoding='utf-8')

    return all_groups_results
def __init__(self):
    """Load the causality data set and register one group per feature.

    Reads ``100w2v_scale_2_causality.xlsx`` into ``self.original_data``.
    ``self.group_dic`` maps a group number to ``[[column], column]``, i.e.
    every group holds exactly one column and is named after it.
    """
    def now():
        return time.asctime(time.localtime(time.time()))

    self.X_train = None
    self.features = None
    self.feature_names = None

    print('{}: Loading the data: 100w2v_scale_2_causality'.format(now()))
    self.original_data = pd.read_excel('100w2v_scale_2_causality.xlsx')
    self.labels = None
    self.featuresDF = None

    # Text-embedding columns are named 0..99 (Word2Vec and 100-dim Doc2Vec);
    # for 50-dim Doc2Vec this would be range(50).
    self.text_features = range(100)

    # Position in this list is the group number. The text features are not
    # registered as a group here; slot 16 is the 'treated' column.
    single_feature_groups = [
        'submission_author_number_original_subreddit',
        'submission_author_number_recommend_subreddit',
        'submission_created_time_hour',
        'cosine_similarity_subreddits_list',
        'comment_submission_similarity',
        'comment_title_similarity',
        'comment_author_number_original_subreddit',
        'comment_author_number_recommend_subreddit',
        'number_of_references_comment_author',
        'comment_created_time_hour',
        'time_between_messages',
        'comment_len',
        'number_of_r',
        'number_of_references_to_submission',
        'number_of_references_to_recommended_subreddit',
        'subreddits_similarity',
        'treated',
    ]
    self.group_dic = {
        number: [[column], column]
        for number, column in enumerate(single_feature_groups)
    }

    print('{}: Data loaded '.format(now()))
def fillModels(cv, mname, fname, comment=None):
    """Insert a model file and its filesystem metadata into the ``models`` table.

    The row holds, in order: model name, raw file content, comment, current
    uid, login name, current gid, access time, modification time, change
    time (all three as ``time.asctime`` local-time strings) and the
    permission bits as an octal string with a single leading zero
    (e.g. ``'0644'``).

    :param cv: DB-API cursor; only ``execute(sql, params)`` is used
    :param mname: model name stored in the first column
    :param fname: path of the model file to load
    :param comment: free-text comment; ``None`` is stored as a single space
    :raises OSError: if ``fname`` cannot be stat'ed or read
    """
    import os
    import stat
    import time

    if comment is None:
        comment = " "

    # One stat call serves all metadata fields.
    info = os.stat(fname)
    atime = time.asctime(time.localtime(info[stat.ST_ATIME]))
    ctime = time.asctime(time.localtime(info[stat.ST_CTIME]))
    mtime = time.asctime(time.localtime(info[stat.ST_MTIME]))

    # Keep the historical '0644' spelling on every interpreter: oct() yields
    # '0644' on Python 2 but '0o644' on Python 3.
    permissions = info[stat.ST_MODE] & 0o777
    mode = '0%o' % permissions if permissions else '0'

    # The file is read client-side instead of using MySQL's LOAD_FILE, which
    # did not work on every deployment.
    with open(fname, "rb") as load_file:
        file_content = load_file.read()

    try:
        login = os.getlogin()
    except (OSError, AttributeError):
        # No controlling terminal, or getlogin is missing on this platform.
        login = '******'

    sql = "INSERT INTO models VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    cv.execute(sql, (
        mname,
        file_content,
        comment,
        str(os.getuid()),
        login,
        str(os.getgid()),
        atime,
        mtime,
        ctime,
        str(mode),
    ))
    return
def fillModels(cv, mname, fname, comment=None):
    """Store a model file plus its stat metadata as one row of ``models``.

    Column order of the inserted row: model name, file bytes, comment, uid of
    the current process, login name, gid of the current process, atime,
    mtime, ctime (``time.asctime`` strings in local time) and the permission
    bits as an octal string with one leading zero (e.g. ``'0644'``).

    :param cv: DB-API cursor; only ``execute(sql, params)`` is called
    :param mname: name under which the model is stored
    :param fname: path of the file holding the model
    :param comment: optional comment; ``None`` becomes ``" "``
    :raises OSError: if ``fname`` cannot be stat'ed or read
    """
    import os
    import stat
    import time

    def as_local_time(seconds):
        return time.asctime(time.localtime(seconds))

    if comment is None:
        comment = " "

    file_stat = os.stat(fname)  # single stat call for every metadata field
    atime = as_local_time(file_stat[stat.ST_ATIME])
    ctime = as_local_time(file_stat[stat.ST_CTIME])
    mtime = as_local_time(file_stat[stat.ST_MTIME])

    # oct() prints '0644' on Python 2 and '0o644' on Python 3; format the
    # bits explicitly so the stored value keeps the '0644' form.
    permission_bits = file_stat[stat.ST_MODE] & 0o777
    mode = '0%o' % permission_bits if permission_bits else '0'

    # Loaded on the client because MySQL's LOAD_FILE was not usable on every
    # installation.
    with open(fname, "rb") as load_file:
        file_content = load_file.read()

    try:
        login = os.getlogin()
    except (OSError, AttributeError):
        # getlogin fails without a controlling terminal and does not exist
        # on some platforms; fall back to the placeholder.
        login = '******'

    sql = "INSERT INTO models VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    row = (
        mname,
        file_content,
        comment,
        str(os.getuid()),
        login,
        str(os.getgid()),
        atime,
        mtime,
        ctime,
        str(mode),
    )
    cv.execute(sql, row)
    return
def benchmark(self, clf, clf_name='default'):
    """Cross-validate ``clf`` on ``self.X_train`` / ``self.labels`` and report.

    Predictions come from ``cross_val_predict`` with ``opts.k_fold`` folds.
    Accuracy, AUC and (when ``opts.print_cm`` is set) the confusion matrix
    are printed and logged.

    :param clf: the classifier to evaluate
    :param clf_name: when equal to ``'GaussianNB'``, ``self.X_train`` is
        replaced by ``self.X_train.toarray()`` first
    :return: ``[classifier description, accuracy, auc, elapsed seconds]``
    """
    def now():
        return time.asctime(time.localtime(time.time()))

    separator = '_' * 80
    print(separator)
    print('{}: Traininig: {}'.format(now(), clf))
    logging.info(separator)
    logging.info('{}: Traininig: {}'.format(now(), clf))

    started = time.time()

    # Cross validation part
    folds = opts.k_fold
    if clf_name == 'GaussianNB':
        self.X_train = self.X_train.toarray()
    predicted = cross_val_predict(clf, self.X_train, self.labels, cv=folds)
    score = metrics.accuracy_score(self.labels, predicted)
    train_time = time.time() - started

    timing_line = "cross validation time: {}".format(train_time)
    print(timing_line)
    logging.info(timing_line)

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(self.labels, predicted,
                                       labels=[-1, 1]))
        logging.info("confusion matrix:")
        logging.info(metrics.confusion_matrix(self.labels, predicted,
                                              labels=[-1, 1]))

    # Class name of the estimator: the text before the first '(' of its repr.
    clf_descr = str(clf).split('(')[0]

    accuracy_line = "Accuracy: {} (+/- {})".format(score.mean(),
                                                   score.std() * 2)
    print(accuracy_line)
    logging.info(accuracy_line)

    auc = metrics.roc_auc_score(self.labels, predicted, average='samples')
    auc_line = 'AUC: {}'.format(auc)
    print(auc_line)
    logging.info(auc_line)

    return [clf_descr, score, auc, train_time]
def get(self):
    """Write the rendered ``templates/hello.html`` page, a greeting and a demo link."""
    context = dict(
        timeofday=time.asctime(),
        filepath=os.path.dirname(__file__),
        somevalue=1.0,
    )
    page = jinja_environment.get_template("templates/hello.html")
    self.response.write(page.render(context))

    # Static trailer appended after the rendered template.
    trailer = (
        "Hello world",
        "<br>",
        '<a href="/add?firstNum=23&secondNum=7"> Add 23 and 7 </a>',
    )
    for fragment in trailer:
        self.response.write(fragment)
def get(self):
    """Respond with the hello template followed by a greeting and an /add link."""
    write = self.response.write

    template_vars = {}
    template_vars["timeofday"] = time.asctime()
    template_vars["filepath"] = os.path.dirname(__file__)
    template_vars["somevalue"] = 1.0

    rendered = jinja_environment.get_template(
        "templates/hello.html").render(template_vars)
    write(rendered)
    write("Hello world")
    write("<br>")
    write('<a href="/add?firstNum=23&secondNum=7"> Add 23 and 7 </a>')
def create_subreddit_data(self):
    """Concatenate all text seen per subreddit and pickle the result.

    For every row of ``self.all_data`` the title, submission body and comment
    body are joined with single spaces and appended to
    ``self.subreddit_dict[subreddit]``. A field that is not a string, or that
    equals ``'[removed]'`` or ``'[deleted]'``, contributes a single space
    instead. The finished dictionary is written to ``subreddit_dict.pickle``
    in the current working directory.
    """
    def now():
        return time.asctime(time.localtime(time.time()))

    placeholders = ('[removed]', '[deleted]')

    def usable_text(value):
        # Missing values (non-strings) and moderation placeholders carry no
        # text; they are replaced by one blank.
        if isinstance(value, str) and value not in placeholders:
            return value
        return ' '

    print('{}: Start calculate subreddit dictionary'.format(now()))

    for _, comment in self.all_data.iterrows():
        concat_text = ' '.join((
            usable_text(comment['title']),
            usable_text(comment['submission_body']),
            usable_text(comment['comment_body']),
        ))
        subreddit = comment['subreddit']
        self.subreddit_dict[subreddit] = (
            self.subreddit_dict.get(subreddit, '') + concat_text)

    with open('subreddit_dict.pickle', 'wb') as handle:
        pickle.dump(self.subreddit_dict, handle,
                    protocol=pickle.HIGHEST_PROTOCOL)

    print('{}: Finish calculate and save subreddit dictionary'.format(now()))
    return
def topic_model(self):
    """
    Fit an LDA topic model on all comments and score the units with it.

    The vocabulary and the model are built from ``self.all_data``; the
    per-topic probabilities are then computed for the comments in
    ``self.units``.
    :return: pandas DF with a ``comment_id`` column followed by
        ``topic_model_0`` .. ``topic_model_<number_of_topics - 1>``
    """
    def log(step):
        print('{}: {}'.format(time.asctime(time.localtime(time.time())),
                              step))

    def tokenize(frame):
        # comment_id -> list of cleaned tokens of the comment body
        tokens_by_id = {}
        for _, row in frame.iterrows():
            tokens_by_id[row['comment_id']] = clean(
                row['comment_body']).split()
        return tokens_by_id

    log('Clean the data')
    units_clean = tokenize(self.units)
    all_data_clean = tokenize(self.all_data)

    # Term dictionary of the corpus: every unique term gets an index.
    log('Create the dictionary')
    dictionary = corpora.Dictionary(all_data_clean.values())

    # Bag-of-words representation of every document, keyed by comment id.
    log('Create units term matrix')
    units_term_matrix = {
        comment_id: dictionary.doc2bow(tokens)
        for comment_id, tokens in units_clean.items()
    }
    log('Create all data term matrix')
    all_data_term_matrix = {
        comment_id: dictionary.doc2bow(tokens)
        for comment_id, tokens in all_data_clean.items()
    }

    log('Create model')
    model = ldamodel.LdaTransformer(num_topics=self.number_of_topics,
                                    id2word=dictionary,
                                    passes=50,
                                    minimum_probability=0)

    log('Fit the model on all data')
    model = model.fit(list(all_data_term_matrix.values()))

    log('Predict topics for units')
    result = model.transform(list(units_term_matrix.values()))

    log('Create final topic model data')
    # Keys and values of units_term_matrix iterate in the same order, so row
    # k of `result` belongs to the k-th comment id.
    comment_ids_df = pd.DataFrame(list(units_term_matrix.keys()),
                                  columns=['comment_id'])
    result_columns = [
        'topic_model_' + str(topic)
        for topic in range(self.number_of_topics)
    ]
    topic_model_result_df = pd.DataFrame(result, columns=result_columns)

    log('Save final topic model data')
    return pd.concat([comment_ids_df, topic_model_result_df], axis=1)
def split_relevant_data(self):
    """
    Shuffle ``self.featuresDF`` and write a fold number into ``group_number``.

    Rows are shuffled, re-indexed from 0 and dealt out in order: groups are
    numbered from 1, each takes ``floor(rows / opts.k_fold)`` rows, and group
    number ``opts.k_fold`` also takes whatever is left over. Afterwards
    ``self.labels`` holds the label column and ``group_number``.
    :return:
    """
    def announce(template, *args):
        message = template.format(
            time.asctime(time.localtime(time.time())), *args)
        print(message)
        logging.info(message)

    i = 1
    number_sample_group = 0
    sample_per_group = math.floor(self.featuresDF.shape[0] / opts.k_fold)
    self.featuresDF = self.featuresDF.sample(frac=1).reset_index(drop=True)

    for index in self.featuresDF.index:
        group_has_room = number_sample_group < sample_per_group
        if not group_has_room and i != opts.k_fold:
            # Current group is full and it is not the last one: open the next.
            announce(
                '{}: finish split samples for group number {} with {} samples',
                i, number_sample_group)
            i += 1
            announce('{}: start split samples for group number {}', i)
            number_sample_group = 0

        # DataFrame.set_value was removed in pandas 1.0; .at is its
        # documented replacement for scalar label-based assignment.
        self.featuresDF.at[index, 'group_number'] = i
        number_sample_group += 1

    # NOTE(review): groups are numbered from 1, so this stores one more than
    # the number of groups created -- confirm that the code consuming
    # opts.k_fold expects that.
    opts.k_fold = i + 1

    # print for the last group
    announce('{}: finish split samples for group number {} with {} samples',
             i, number_sample_group)

    self.labels = self.featuresDF[[self.label_column_name, 'group_number']]
    announce('{}: Finish split the data')
def index(request):
    """Render ``index.html`` for a posted number, or for a random one.

    On a POST that carries a ``number`` field that value is used (and echoed
    to stdout); in every other case ``random_number()`` supplies it. The
    template receives ``add_list`` (``get_list(number)``), ``number`` and the
    current local time as ``time``.

    :param request: the incoming HTTP request
    :return: the response produced by ``render``
    """
    times = time.asctime(time.localtime(time.time()))

    number = None
    if request.method == "POST":
        number = request.POST.get("number")
        # Function-call form: the Python 2 print statement is a syntax error
        # on Python 3.
        print(number)

    if number is None:
        number = random_number()

    add_list = get_list(number)
    return render(request, 'index.html',
                  {'add_list': add_list, 'number': number, 'time': times})
def save_results(self):
    """Saves everything to a file"""
    # NOTE(review): the visible part of this method only assembles the header
    # and the table text; no write call is present here -- confirm whether
    # the writing step lives elsewhere.
    padding = 24  # Column width of every data point

    date = time.asctime()
    operator = self.main.framework["Configs"]["config"]["settings"].get(
        "Current_operator", "None")
    header = (
        "SQC self test measurement file \n Date: {} \n Operator: {} \n\n"
    ).format(date, operator)

    empttykeys = list(self.data["Empty"].keys())
    Cardkeys = list(self.data["TestCard"].keys())
    measurements = empttykeys + Cardkeys

    # First header row: measurement names; second row: their units.
    header += "".join(meas.ljust(padding) for meas in measurements)
    units = ["#".ljust(padding)]
    units += [
        self.data["units"].get(meas, "arb. units").ljust(padding)
        for meas in measurements
    ]
    header += "\n" + "".join(units)

    # One column per measurement: empty-measurement columns first, then the
    # test-card columns.
    finalarray = np.ones(shape=(self.samples, len(measurements)))
    for column, meas in enumerate(empttykeys):
        finalarray[:, column] = self.data["Empty"][meas]
    for column, meas in enumerate(Cardkeys, len(empttykeys)):
        finalarray[:, column] = self.data["TestCard"][meas]

    table_rows = [
        "".join(str(entry).ljust(padding) for entry in line) + "\n"
        for line in finalarray
    ]
    filecontent = "\n" + "".join(table_rows)
def benchmark(self, model, model_name='default'):
    """Evaluate ``model`` on ``self.data`` / ``self.labels`` with 100-fold CV.

    Prints the timing, the confusion matrix for the labels ``[-1, 1]``, the
    accuracy and the AUC.

    :param model: the estimator passed to ``cross_val_predict``
    :param model_name: name shown in the progress line
    :return: ``[model description, accuracy, auc, elapsed seconds]``
    """
    print('_' * 80)
    timestamp = time.asctime(time.localtime(time.time()))
    print('{}: Traininig: {}'.format(timestamp, model_name))
    print(model)

    started = time.time()
    # Cross validation part
    folds = 100
    predicted = cross_val_predict(model, self.data, self.labels, cv=folds)
    score = metrics.accuracy_score(self.labels, predicted)
    train_time = time.time() - started
    print("train and test time: {}".format(train_time))

    print("confusion matrix:")
    confusion = metrics.confusion_matrix(self.labels, predicted,
                                         labels=[-1, 1])
    print(confusion)

    # Class name of the estimator: the text before the first '(' of its repr.
    model_descr = str(model).split('(')[0]
    print("Accuracy: {} (+/- {})".format(score.mean(), score.std() * 2))

    auc = metrics.roc_auc_score(self.labels, predicted, average='samples')
    print('AUC: {}'.format(auc))

    return [model_descr, score, auc, train_time]
def __init__(self):
    """Train Doc2Vec on the comment bodies and export the scaled vectors.

    Steps:
      1. Read ``FinalFeatures.xlsx`` and tokenize every ``comment_body``
         (lower-case, punctuation replaced by blanks, English stop words and
         empty tokens removed).
      2. Order the documents as positives (``IsEfficient == 1``) followed by
         negatives, tag them through ``self.labelizeComments`` with the
         prefixes ``'POS'`` / ``'NEG'`` and train a 100-dimensional Doc2Vec
         model on them.
      3. Store the document vectors, min-max scaled, in ``self.data`` and the
         labels (1 / -1) in ``self.labels``, both in that same order.
      4. Merge the vectors onto the original table by ``comment_id`` and write
         ``100_d2v_scale.csv``.

    :raises ValueError: from ``random.sample`` when the data holds fewer than
        110 positive or 740 negative comments
    """
    def now():
        return time.asctime(time.localtime(time.time()))

    print('{}: Loading the data'.format(now()))

    punctuation = ';:"-,./()'
    stop = set(stopwords.words('english'))  # set: O(1) membership tests

    def tokenize(text):
        text = text.strip().lower()
        for mark in punctuation:
            text = text.replace(mark, ' ')
        return [
            word for word in text.split(" ")
            if word not in stop and word != '' and '\r\r' not in word
        ]

    data = pd.read_excel('FinalFeatures.xlsx')

    sentences = []
    true_index = []
    false_index = []
    for position, (_, comment) in enumerate(data.iterrows()):
        sentences.append(tokenize(comment['comment_body']))
        if comment['IsEfficient'] == 1:
            true_index.append(position)
        else:
            false_index.append(position)

    # A random subset of each class is moved to the end of its class block;
    # all documents still take part in training.
    true_test_index = random.sample(true_index, 110)
    false_test_index = random.sample(false_index, 740)
    moved_true = set(true_test_index)
    moved_false = set(false_test_index)
    true_train_index = [p for p in true_index if p not in moved_true]
    false_train_index = [p for p in false_index if p not in moved_false]

    # Original row position of every document, in training order.
    pos_order = true_train_index + true_test_index
    neg_order = false_train_index + false_test_index

    X_POS = self.labelizeComments([sentences[p] for p in pos_order], 'POS')
    X_NEG = self.labelizeComments([sentences[p] for p in neg_order], 'NEG')
    final_sentences = list(itertools.chain(X_POS, X_NEG))

    print('{}: Start calculating Doc2Vec'.format(now()))
    number_of_features = 100
    model = Doc2Vec(min_count=2, window=10, size=number_of_features,
                    negative=5, workers=7, iter=55)
    model.build_vocab(final_sentences)
    print('{}: Start train Doc2Vec'.format(now()))
    for epoch in range(50):
        model.train(final_sentences, total_examples=model.corpus_count,
                    word_count=2)
    print('{}: Finish calculating Doc2Vec'.format(now()))

    # Row i of self.data is document i of the POS block, followed by the
    # documents of the NEG block.
    data_size = len(sentences)
    true_size = len(pos_order)
    false_size = len(neg_order)
    self.data = np.zeros((data_size, number_of_features))
    self.labels = np.zeros(data_size)
    for i in range(true_size):
        self.data[i] = model.docvecs['POS_' + str(i)]
        self.labels[i] = 1
    for j in range(false_size):
        row = true_size + j
        self.data[row] = model.docvecs['NEG_' + str(j)]
        self.labels[row] = -1
    print(self.labels)

    # Scale to non-negative values so Multinomial NB can be trained on them.
    min_max_scale = MinMaxScaler()
    self.data = min_max_scale.fit_transform(self.data)

    # Attach the comment id that belongs to each vector. The rows of
    # self.data follow pos_order + neg_order, not the order of `data`, so the
    # id has to be looked up through the original row position.
    comments_id = data['comment_id'].values
    row_order = pos_order + neg_order
    w2v_id = [
        sample.tolist() + [comments_id[source_row]]
        for sample, source_row in zip(self.data, row_order)
    ]

    # list(...) is required: a Python 3 range object has no append().
    index = list(range(number_of_features))
    index.append('comment_id')
    train_vecs_d2vPD = pd.DataFrame.from_records(w2v_id, columns=index)
    final_features = pd.merge(data, train_vecs_d2vPD, on='comment_id')
    final_features.to_csv('100_d2v_scale.csv', encoding='utf-8')
    return
def iterateOverFeaturesGroups(self):
    """Train the models on every ordered selection of the five feature groups.

    For each selection, ``self.X_train`` is set to the columns of the chosen
    groups and ``self.ModelsIteration()`` is run. The accumulated results are
    written to ``pythonResultsTemp.csv`` after every selection and to
    ``pythonResultsFinal.csv`` at the end. Nothing is returned.
    """
    def announce(template, names):
        message = template.format(
            time.asctime(time.localtime(time.time())), names)
        print(message)
        logging.info(message)

    columns_names = ['classifier_name', 'score', 'auc', 'train_time',
                     'group_list', 'k_fold']
    result_frames = []
    all_groups_results = pd.DataFrame()

    for number_of_groups in range(1, 6):
        # NOTE(review): permutations() trains the same set of groups once per
        # ordering (325 runs instead of the 31 distinct subsets) -- confirm
        # whether combinations() was intended.
        for groups in itertools.permutations(range(5), number_of_groups):
            features = [
                column
                for group in groups
                for column in self.group_dic[group][0]
            ]
            self.X_train = self.featuresDF[features]
            group_names = [self.group_dic[group][1] for group in groups]

            announce('{}: Start training with the groups: {} ', group_names)
            group_results = self.ModelsIteration()
            announce('{}: Finish training with the groups: {}', group_names)

            for model in group_results:
                model.append(group_names)
                model.append(opts.k_fold)

            # DataFrame.append was removed in pandas 2.0; gather the
            # per-selection frames and concatenate them instead.
            result_frames.append(
                pd.DataFrame(group_results, columns=columns_names))
            all_groups_results = pd.concat(result_frames, ignore_index=True)
            # Checkpoint so partial results survive an interrupted run.
            all_groups_results.to_csv('pythonResultsTemp.csv',
                                      encoding='utf-8')

    all_groups_results.to_csv('pythonResultsFinal.csv', encoding='utf-8')
    return
def benchmark(self, clf, clf_name='default'):
    """Cross-validate ``clf`` over the folds marked in ``group_number``.

    For every fold ``0 .. opts.k_fold - 1`` the rows of ``self.X_train``
    whose ``group_number`` equals the fold are held out, ``clf`` is fitted
    on the remaining rows (columns ``self.features``) and accuracy and ROC
    AUC of the hard predictions are recorded and logged.

    :param clf: the model to train and test (needs ``fit`` and ``predict``)
    :param str clf_name: the name of the model
    :return: [clf_name, average accuracy, average AUC, total elapsed time]
    :rtype: list
    """
    print('_' * 80)
    print('{}: Traininig: {}'.format(time.asctime(), clf))
    logging.info('_' * 80)
    logging.info('{}: Traininig: {}'.format(time.asctime(), clf))

    # Cross validation part
    # The folds below are selected with .loc, so only densify inputs that
    # really offer toarray(); calling it unconditionally raised
    # AttributeError for a DataFrame.
    if clf_name == 'GaussianNB' and hasattr(self.X_train, 'toarray'):
        self.X_train = self.X_train.toarray()

    t1 = time.time()
    score = []
    auc = []
    for out_group in range(opts.k_fold):
        t0 = time.time()
        # create train and test data
        is_test_row = self.X_train['group_number'] == out_group
        is_train_row = self.X_train['group_number'] != out_group
        test_data = self.X_train.loc[is_test_row][self.features]
        test_label = self.labels.loc[is_test_row]['IsEfficient']
        train_data = self.X_train.loc[is_train_row][self.features]
        train_label = self.labels.loc[is_train_row]['IsEfficient']

        # train the model
        clf.fit(train_data, train_label)
        predicted = clf.predict(test_data)

        # Each metric is computed once and reused for the averages and logs.
        fold_accuracy = metrics.accuracy_score(test_label, predicted)
        fold_auc = metrics.roc_auc_score(test_label, predicted,
                                         average='samples')
        score.append(fold_accuracy)
        auc.append(fold_auc)

        logging.info("Fold number:")
        logging.info(out_group)
        logging.info("accuracy:")
        logging.info(fold_accuracy)
        logging.info("AUC:")
        logging.info(fold_auc)

        if opts.print_cm:
            logging.info("confusion matrix:")
            logging.info(
                metrics.confusion_matrix(test_label, predicted,
                                         labels=[-1, 1]))

        train_time = time.time() - t0
        logging.info("cross validation time: {}".format(train_time))

    average_acc = sum(score) / len(score)
    print("Average Accuracy: {}".format(average_acc))
    logging.info("Average Accuracy: {}".format(average_acc))

    average_auc = sum(auc) / len(auc)
    print("Average AUC: {}".format(average_auc))
    logging.info('Average AUC: {}'.format(average_auc))

    # The reported time covers all folds together.
    train_time = time.time() - t1

    return [clf_name, average_acc, average_auc, train_time]
def iterateOverFeaturesGroups(self, Peff_up_threshold, Peff_down_threshold):
    """Greedy stepwise selection of feature groups, scored by the best AUC.

    With ``opts.is_backward`` set, the search starts from all groups of
    ``self.group_dic`` and tries to remove one group per round; otherwise it
    starts empty and tries to add one group per round.  Every candidate is
    evaluated with ``self.ModelsIteration()`` and scored by the highest AUC
    (third field) among the returned model rows.  The best candidate of a
    round is accepted when its AUC is not lower than the current AUC; the
    search stops at the first round without an accepted candidate, when no
    candidates remain, or after as many rounds as there were candidates.

    All model rows are accumulated (and checkpointed to
    ``test_results_stepwise.csv``) with the feature-group names,
    ``opts.k_fold`` and both thresholds appended.

    :param Peff_up_threshold: only recorded in the results and log messages
    :param Peff_down_threshold: only recorded in the results and log messages
    :return: one row per evaluated (candidate, model) pair
    :rtype: pandas.DataFrame
    """
    columns_names = [
        'classifier_name', 'score', 'auc', 'train_time', 'features_list',
        'k_fold', 'Peff_up_threshold', 'Peff_down_threshold'
    ]
    all_groups_results = pd.DataFrame()
    results_frames = []

    remaining_features = list(self.group_dic.keys())
    if opts.is_backward:  # use backward elimination
        selected_features = list(self.group_dic.keys())
    else:  # use forward selection
        selected_features = []

    current_auc, best_new_auc = 0.0, 0.0
    remain_number_of_candidate = len(remaining_features)
    while (remaining_features and current_auc == best_new_auc
           and remain_number_of_candidate > 0):
        auc_with_candidates = []
        for candidate in remaining_features:
            if opts.is_backward:
                # evaluate the current selection without the candidate
                candidate_groups = [group for group in selected_features
                                    if group != candidate]
            else:
                # evaluate the current selection plus the candidate
                candidate_groups = selected_features + [candidate]

            self.features = [item
                             for group in candidate_groups
                             for item in self.group_dic[group][0]]
            # 'group_number' is needed to split the folds; it is not a feature
            features = self.features + ['group_number']
            self.X_train = self.featuresDF[features]
            features_names = [self.group_dic[group][1]
                              for group in candidate_groups]

            start_message = '{}: Start training with the groups: {} '.format(
                time.asctime(), features_names)
            print(start_message)
            logging.info(start_message)

            group_results = self.ModelsIteration()
            best_auc = max(result[2] for result in group_results)
            auc_with_candidates.append((best_auc, candidate))

            finish_message = '{}: Finish training with the groups: {}'.format(
                time.asctime(), features_names)
            print(finish_message)
            logging.info(finish_message)

            for model in group_results:
                model.append(features_names)
                model.append(opts.k_fold)
                model.append(Peff_up_threshold)
                model.append(Peff_down_threshold)
            results_frames.append(
                pd.DataFrame(group_results, columns=columns_names))

            # DataFrame.append() no longer exists in pandas >= 2.0.
            all_groups_results = pd.concat(results_frames, ignore_index=True)
            all_groups_results.to_csv('test_results_stepwise.csv',
                                      encoding='utf-8')

        # Highest AUC wins; ties go to the larger group key, exactly as the
        # former sort() + pop() did.
        best_new_auc, best_candidate = max(auc_with_candidates)
        if current_auc <= best_new_auc:
            if opts.is_backward:  # use backward elimination
                selected_features.remove(best_candidate)
            else:  # use forward selection
                selected_features.append(best_candidate)
            remaining_features.remove(best_candidate)
            current_auc = best_new_auc
        else:
            no_candidate_message = (
                '{}: No candidate was chosen for threshold: {} and {}, number of selected features is {}.'
                .format(time.asctime(), Peff_down_threshold,
                        Peff_up_threshold, len(selected_features)))
            logging.info(no_candidate_message)
            print(no_candidate_message)

        # one candidate can be chosen, if not- we go forward to the next step.
        remain_number_of_candidate -= 1

    selected_features_names = [
        self.group_dic[feature][1] for feature in selected_features
    ]
    logging.info(
        '{}: Selected features for threshold: {} and {} are: {} and the best AUC is: {}'
        .format(time.asctime(), Peff_down_threshold, Peff_up_threshold,
                selected_features_names, best_new_auc))
    print(
        '{}: Selected features for threshold: {} and {} are: {} and the best AUC is: {}.'
        .format(time.asctime(), Peff_down_threshold, Peff_up_threshold,
                selected_features_names, best_new_auc))

    return all_groups_results
def benchmark(self, clf, clf_name='default'):
    """
    Train and test the model (clf) once per fold with cross validation.

    A fold is the set of rows of ``self.X_train`` whose ``group_number``
    equals the fold number; the model is fitted on all other rows, using the
    columns in ``self.features`` and the label column
    ``self.label_column_name`` of ``self.labels``.

    :param clf: the model to train and test
    :param str clf_name: the name of the model
    :return: clf_name, average_acc, average_auc, train_time of the model
    :rtype list
    """
    print('_' * 80)
    print('{}: Traininig: {}'.format(time.asctime(), clf))
    logging.info('_' * 80)
    logging.info('{}: Traininig: {}'.format(time.asctime(), clf))

    # Cross validation part
    # The folds below are selected with .loc, so only densify inputs that
    # really offer toarray(); calling it unconditionally raised
    # AttributeError for a DataFrame.
    if clf_name == 'GaussianNB' and hasattr(self.X_train, 'toarray'):
        self.X_train = self.X_train.toarray()

    t1 = time.time()
    score = []
    auc = []
    # NOTE(review): folds run from 1 to opts.k_fold - 1, so a group numbered
    # 0 (or opts.k_fold) is never held out -- confirm the group numbering.
    for out_group in range(1, opts.k_fold):
        t0 = time.time()
        # create train and test data
        is_test_row = self.X_train['group_number'] == out_group
        is_train_row = self.X_train['group_number'] != out_group
        test_data = self.X_train.loc[is_test_row, self.features]
        test_label = self.labels.loc[is_test_row, self.label_column_name]
        train_data = self.X_train.loc[is_train_row, self.features]
        train_label = self.labels.loc[is_train_row, self.label_column_name]

        # train the model
        clf.fit(train_data, train_label)
        predicted = clf.predict(test_data)

        # Each metric is computed once and reused for the averages and logs.
        fold_accuracy = metrics.accuracy_score(test_label, predicted)
        fold_auc = metrics.roc_auc_score(test_label, predicted,
                                         average='samples')
        score.append(fold_accuracy)
        auc.append(fold_auc)

        logging.info("Fold number:")
        logging.info(out_group)
        logging.info("accuracy:")
        logging.info(fold_accuracy)
        logging.info("AUC:")
        logging.info(fold_auc)

        if opts.print_cm:
            fold_confusion = metrics.confusion_matrix(test_label, predicted,
                                                      labels=[0, 1])
            print("confusion matrix:")
            print(fold_confusion)
            logging.info("confusion matrix:")
            logging.info(fold_confusion)

        train_time = time.time() - t0
        logging.info("cross validation time: {}".format(train_time))

    average_acc = sum(score) / len(score)
    print("Average Accuracy: {}".format(average_acc))
    logging.info("Average Accuracy: {}".format(average_acc))

    average_auc = sum(auc) / len(auc)
    print("Average AUC: {}".format(average_auc))
    logging.info('Average AUC: {}'.format(average_auc))

    # The reported time covers all folds together.
    train_time = time.time() - t1

    return [clf_name, average_acc, average_auc, train_time]
def main(only_subreddit_similarity=False, only_percent=False):
    """Compute the features of every classified reference and export them.

    Each row of ``create_features.classify_ref`` is extended with its
    computed features; the rows are accumulated (newest first) and written
    to ``Features_with_commnent_time.csv`` after every row and to
    ``FinalFeatures_with_comment_time2.csv`` at the end.

    :param bool only_subreddit_similarity: compute only the similarity
        between the original and the recommended subreddit
    :param bool only_percent: compute only the percentage of efficient
        references of the comment author (ignored when
        ``only_subreddit_similarity`` is set)
    :return: None
    """
    print('{}: Loading the data'.format(time.asctime()))
    create_features = CreateFeatures()
    print('{}: Finish loading the data'.format(time.asctime()))
    print('data sizes: all data: {}, ref data: {}, classify ref data: {} '.format(
        create_features.all_data.shape, create_features.references.shape,
        create_features.classify_ref.shape))

    if opts.pickel_not_saved:
        create_features.create_subreddit_data()
    else:
        with open('subreddit_dict.pickle', 'rb') as handle:
            create_features.subreddit_dict = pickle.load(handle)

    # Stays None until the first row is processed.  Testing ``index == 0``
    # instead only worked for an index that starts at 0 and never repeats it.
    all_comments_features = None
    for index, comment in create_features.classify_ref.iterrows():
        if index % 100 == 0:
            print('{}: Finish calculate {} samples'.format(time.asctime(), index))

        comment_author = comment['comment_author']
        original_subreddit = comment['subreddit']
        recommend_subreddit = comment['recommend_subreddit']
        if opts.use_date_threshold:
            # if we use the data threshold - use the comment time, else use the current time.
            comment_time = comment['comment_created_time']
            submission_time = comment['submission_created_time']
        else:
            comment_time = datetime.utcnow()
            submission_time = datetime.utcnow()

        if only_subreddit_similarity:
            subreddits_similarity = create_features.tfifd_similarity(original_subreddit, recommend_subreddit)
            featuresDF = pd.Series(subreddits_similarity)

        elif only_percent:
            number_of_efficient_references_comment_author = \
                create_features.number_of_efficient_references(comment_author, comment_time)
            number_of_checked_references = create_features.number_of_checked_references(comment_author,
                                                                                        comment_time)
            if number_of_checked_references > 0:
                percent_efficient_references_comment_author = \
                    (100.0 * number_of_efficient_references_comment_author) / number_of_checked_references
            else:
                percent_efficient_references_comment_author = 0
            featuresDF = pd.Series(percent_efficient_references_comment_author)

        else:
            # Calculate similarity between the original and recommended subreddits:
            subreddits_similarity = create_features.tfifd_similarity(original_subreddit, recommend_subreddit)

            # Get comment author features:
            comment_author_number_original_subreddit, comment_author_number_recommend_subreddit, \
                comment_author_subreddit_list = create_features.number_list_of_message(
                    original_subreddit, recommend_subreddit, comment_author, comment_time)
            number_of_references_comment_author = create_features.number_of_references(comment_author,
                                                                                       comment_time)
            number_of_efficient_references_comment_author = \
                create_features.number_of_efficient_references(comment_author, comment_time, is_efficient=1)
            number_of_inefficient_references_comment_author = \
                create_features.number_of_efficient_references(comment_author, comment_time, is_efficient=-1)
            number_of_checked_references = create_features.number_of_checked_references(comment_author,
                                                                                        comment_time)
            if number_of_checked_references > 0:
                percent_efficient_references_comment_author = \
                    (100.0 * number_of_efficient_references_comment_author) / number_of_checked_references
            else:
                percent_efficient_references_comment_author = 0

            # Get submission author features:
            submission_author = comment['submission_author']
            submission_author_number_original_subreddit, submission_author_number_recommend_subreddit, \
                submission_author_subreddit_list = create_features.number_list_of_message(
                    original_subreddit, recommend_subreddit, submission_author, submission_time)

            # Similarity between comment and submission authors subreddits lists:
            cosine_similarity_subreddits_list = get_cosine(Counter(comment_author_subreddit_list),
                                                           Counter(submission_author_subreddit_list))

            # Get the hour of the comment and the submission:
            comment_created_time_hour = convert_utc(comment['comment_created_time']).hour
            submission_created_time_hour = convert_utc(comment['submission_created_time']).hour

            # Get the time between the submission was published and the comment time,
            # stored as <hours>.<minutes>, i.e. whole hours plus minutes / 100:
            time_to_comment = comment['time_to_comment']
            time_between_messages_hour = math.floor(time_to_comment / 3600.0)
            time_between_messages_min = \
                math.floor((time_to_comment - 3600 * time_between_messages_hour) / 60.0) / 100.0
            time_between_messages = time_between_messages_hour + time_between_messages_min

            # Comment features:
            comment_body = comment['comment_body']
            submission_body = comment['submission_body']
            submission_title = comment['title']
            comment_len, number_of_r = number_of_subreddits(comment_body, '/r/')
            # NOTE(review): tfifd_similarity is called with one list here but
            # with two positional arguments above -- confirm it accepts both.
            if isinstance(submission_body, str) and isinstance(comment_body, str):
                comment_submission_similarity = create_features.tfifd_similarity([comment_body, submission_body])
            else:
                comment_submission_similarity = 0.0
            if isinstance(submission_title, str) and isinstance(comment_body, str):
                comment_title_similarity = create_features.tfifd_similarity([comment_body, submission_title])
            else:
                comment_title_similarity = 0.0
            number_of_references_to_submission = comment['num_comments']

            # subreddit features:
            number_of_references_to_recommended_subreddit = \
                create_features.popular_subreddit(recommend_subreddit, comment_time)

            features = [comment_author_number_original_subreddit, comment_author_number_recommend_subreddit,
                        percent_efficient_references_comment_author, number_of_references_comment_author,
                        number_of_efficient_references_comment_author,
                        number_of_inefficient_references_comment_author,
                        submission_author_number_original_subreddit,
                        submission_author_number_recommend_subreddit,
                        cosine_similarity_subreddits_list, comment_created_time_hour,
                        submission_created_time_hour, time_between_messages, comment_len, number_of_r,
                        comment_submission_similarity, comment_title_similarity,
                        number_of_references_to_submission, number_of_references_to_recommended_subreddit,
                        subreddits_similarity]
            labels = ('comment_author_number_original_subreddit', 'comment_author_number_recommend_subreddit',
                      'percent_efficient_references_comment_author', 'number_of_references_comment_author',
                      'number_of_efficient_references_comment_author',
                      'number_of_inefficient_references_comment_author',
                      'submission_author_number_original_subreddit',
                      'submission_author_number_recommend_subreddit',
                      'cosine_similarity_subreddits_list', 'comment_created_time_hour',
                      'submission_created_time_hour', 'time_between_messages', 'comment_len', 'number_of_r',
                      'comment_submission_similarity', 'comment_title_similarity',
                      'number_of_references_to_submission', 'number_of_references_to_recommended_subreddit',
                      'subreddits_similarity')

            featuresDF = pd.Series(features, index=labels)

        # Series.append() no longer exists in pandas >= 2.0.
        comment_features = pd.concat([comment, featuresDF])
        # A single unnamed feature lands under the integer label 0.  The old
        # rename(columns={'0': ...}) was not a valid Series call and used the
        # wrong label type, so rename the index entry instead.
        if only_subreddit_similarity:
            comment_features = comment_features.rename({0: 'subreddits_similarity'})
        elif only_percent:
            comment_features = comment_features.rename({0: 'percent_efficient_references_comment_author'})

        if all_comments_features is None:
            all_comments_features = comment_features
        else:
            # newest comment first, as before
            all_comments_features = pd.concat([comment_features, all_comments_features], axis=1)
        # checkpoint after every comment
        all_comments_features.T.to_csv('Features_with_commnent_time.csv', encoding='utf-8')

    if all_comments_features is None:
        # No rows at all: there is nothing to export.
        print('{}: No comments to calculate features for'.format(time.asctime()))
        return

    # export the data to csv file
    all_comments_features.T.to_csv('FinalFeatures_with_comment_time2.csv', encoding='utf-8')
def __init__(self, label_column_name):
    """Load the feature table and declare the feature groups.

    Reads the CSV file
    ``matches_data_frame_treated_propensity_score_treated_logistic_all_deltas.csv``
    from ``features_directory`` into ``self.featuresDF`` and builds
    ``self.group_dict``, which maps a group index to
    ``[list of feature columns, group name]``.

    :param label_column_name: stored as ``self.label_column_name``
    """
    # (feature columns of the group, group name); the position in this list
    # is the group's key in self.group_dict
    feature_groups = [
        (['commenter_number_submission', 'commenter_number_comment',
          'number_of_comments_in_tree_by_comment_user',
          'commenter_seniority_days'],
         'commenter_features'),
        (['submitter_number_submission', 'submitter_seniority_days',
          'submitter_number_comment',
          'number_of_comments_in_tree_from_submitter',
          'number_of_respond_by_submitter_total',
          'number_of_respond_by_submitter'],
         'submitter_features'),
        (['is_first_comment_in_tree', 'comment_len', 'comment_depth'],
         'comment_features'),
        (['time_ratio', 'time_between_messages', 'time_until_first_comment',
          'time_between_comment_first_comment'],
         'time_features'),
        (['submission_len', 'title_len'], 'submission_features'),
        (['respond_to_comment_user_responses_ratio',
          'respond_to_comment_user_all_ratio', 'respond_total_ratio'],
         'ratio_features'),
        (['treated'], 'trated'),
        (['nltk_com_sen_pos', 'nltk_com_sen_neg', 'nltk_com_sen_neutral',
          'nltk_sub_sen_pos', 'nltk_sub_sen_neg', 'nltk_sub_sen_neutral',
          'nltk_title_sen_pos', 'nltk_title_sen_neg',
          'nltk_title_sen_neutral', 'nltk_sim_sen'],
         'sentiment features'),
        (['percent_adj'], 'percent_adj'),
        (['submmiter_commenter_tfidf_cos_sim'],
         'submitted_commenter_similarity'),
        (['topic_model_0', 'topic_model_1', 'topic_model_2', 'topic_model_3',
          'topic_model_4', 'topic_model_5', 'topic_model_6', 'topic_model_7',
          'topic_model_8', 'topic_model_9', 'topic_model_10',
          'topic_model_11', 'topic_model_12', 'topic_model_13',
          'topic_model_14'],
         'topic_model'),
    ]

    self.X_train = None
    self.features = None
    self.feature_names = None
    print('{}: Loading the data: final_features_causality'.format(
        time.asctime()))
    self.labels = None
    features_path = os.path.join(
        features_directory,
        'matches_data_frame_treated_propensity_score_treated_logistic_all_deltas.csv')
    self.featuresDF = pd.read_csv(features_path)
    self.label_column_name = label_column_name
    self.group_dict = {
        group_index: [group_features, group_name]
        for group_index, (group_features, group_name)
        in enumerate(feature_groups)
    }
    print('{}: Data loaded '.format(time.asctime()))
def __init__(self, message='', devID=0xFF, tstamp=None):
    """Store a message together with its device id and timestamp.

    :param message: the message text
    :param devID: identifier of the originating device
    :param tstamp: timestamp string; when omitted (``None``) the current
        local time from ``time.asctime()`` is used
    """
    # A default of time.asctime() in the signature is evaluated only once,
    # when the function is defined, so every instance would share that stale
    # timestamp.  Resolve it per call instead.
    if tstamp is None:
        tstamp = time.asctime()
    self.tstamp = tstamp
    self.devID = devID
    self.message = message
def main():
    """Render ``MAIN_HTML`` with the current local time passed as ``dt_obj``."""
    template_context = {'dt_obj': time.asctime()}
    return render_template(MAIN_HTML, **template_context)
def _tweet(self, message):
    """Write a timestamped ``message`` line to ``self.logger`` and flush it.

    Does nothing when ``self.logger`` is ``None``.

    :param message: text placed between ``[ `` and `` ]`` on the log line
    """
    # "<>" is not valid syntax on Python 3; an identity test against None
    # works on both Python 2 and 3.
    if self.logger is not None:
        import time
        self.logger.write("%s - [ %s ]\n" % (time.asctime(), message))
        self.logger.flush()
def mavlink_packet(self, m):
    '''handle mavlink packets

    Dispatches on ``m.get_type()``: heartbeat and sensor messages update the
    matching state, the waypoint messages drive the waypoint download /
    list / save protocol, and SYS_STATUS updates both the battery state and
    the geofence state (present / enabled / healthy) of the 'fence' module.
    '''
    # Bound locally so that time.time() and time.asctime() below both work
    # whatever form of "time" import the enclosing file uses; the former mix
    # of time() and time.asctime() could never work together.
    import time

    mtype = m.get_type()
    if mtype == 'HEARTBEAT':
        self.time_at_last_heartbeat = time.time()
    elif mtype == 'GLOBAL_POSITION_INT':
        if (self.settings.target_system == 0
                or self.settings.target_system == m.get_srcSystem()):
            self.gps_update(m)
    elif mtype == 'SCALED_PRESSURE3':
        self.psensor_update(m)
    elif mtype == 'SCALED_PRESSURE':
        self.dsensor_update(m)
    elif mtype == "SYS_STATUS":
        self.battery_update(m)

        # The fence handling used to sit in a second SYS_STATUS branch
        # further down that could never be reached.
        fence = self.module('fence')
        # NOTE(review): assumes self.module() returns None for a module that
        # is not loaded -- confirm; without this guard every SYS_STATUS
        # would fail when there is no fence module.
        if fence is not None:
            bits = mavutil.mavlink.MAV_SYS_STATUS_GEOFENCE

            present = ((m.onboard_control_sensors_present & bits) == bits)
            if fence.present is False and present is True:
                self.say("fence present")
            elif fence.present is True and present is False:
                self.say("fence removed")
            # was "self.present = present": the value is read from the
            # fence module, so it has to be stored there as well
            fence.present = present

            enabled = ((m.onboard_control_sensors_enabled & bits) == bits)
            if fence.enabled is False and enabled is True:
                self.say("fence enabled")
            elif fence.enabled is True and enabled is False:
                self.say("fence disabled")
            fence.enabled = enabled

            healthy = ((m.onboard_control_sensors_health & bits) == bits)
            if fence.healthy is False and healthy is True:
                self.say("fence OK")
            elif fence.healthy is True and healthy is False:
                self.say("fence breach")
            fence.healthy = healthy

            # console output for fence:
            if fence.enabled is False:
                fence.console.set_status('Fence', 'FEN', row=0, fg='grey')
            elif fence.enabled is True and fence.healthy is True:
                self.console.set_status('Fence', 'FEN', row=0, fg='green')
            elif fence.enabled is True and fence.healthy is False:
                self.console.set_status('Fence', 'FEN', row=0, fg='red')
    elif mtype == 'RC_CHANNELS_RAW':
        self.rc_update(m)
    elif mtype == 'SERVO_OUTPUT_RAW':
        self.servo_update(m)
    elif mtype == 'MAV_STATE_CRITICAL':
        self.mav_state_critical = True
    elif mtype == 'MAV_STATE_EMERGENCY':
        self.mav_state_emergency = True
    elif mtype in ['WAYPOINT_COUNT', 'MISSION_COUNT']:
        if self.wp_op is None:
            self.console.error("No waypoint load started")
        else:
            self.wploader.clear()
            self.wploader.expected_count = m.count
            self.console.writeln(
                "Requesting %u waypoints t=%s now=%s" %
                (m.count, time.asctime(time.localtime(m._timestamp)),
                 time.asctime()))
            self.send_wp_requests()
    elif mtype in ['WAYPOINT', 'MISSION_ITEM'] and self.wp_op is not None:
        if m.seq < self.wploader.count():
            # duplicate of a waypoint that is already loaded
            return
        if m.seq + 1 > self.wploader.expected_count:
            self.console.writeln(
                "Unexpected waypoint number %u - expected %u" %
                (m.seq, self.wploader.count()))
        self.wp_received[m.seq] = m
        # move every waypoint that is next in sequence into the loader
        next_seq = self.wploader.count()
        while next_seq in self.wp_received:
            m = self.wp_received.pop(next_seq)
            self.wploader.add(m)
            next_seq += 1
        if self.wploader.count() != self.wploader.expected_count:
            # still incomplete: ask for the missing waypoints
            self.send_wp_requests()
            return
        if self.wp_op == 'list':
            for i in range(self.wploader.count()):
                w = self.wploader.wp(i)
                print(
                    "%u %u %.10f %.10f %f p1=%.1f p2=%.1f p3=%.1f p4=%.1f cur=%u auto=%u"
                    % (w.command, w.frame, w.x, w.y, w.z, w.param1,
                       w.param2, w.param3, w.param4, w.current,
                       w.autocontinue))
            if self.logdir is not None:
                waytxt = os.path.join(self.logdir, 'way.txt')
                self.save_waypoints(waytxt)
                print("Saved waypoints to %s" % waytxt)
        elif self.wp_op == "save":
            self.save_waypoints(self.wp_save_filename)
        # the download is complete: reset the transfer state
        self.wp_op = None
        self.wp_requested = {}
        self.wp_received = {}
    elif mtype in ["WAYPOINT_REQUEST", "MISSION_REQUEST"]:
        self.process_waypoint_request(m, self.master)
    elif mtype in ["WAYPOINT_CURRENT", "MISSION_CURRENT"]:
        if m.seq != self.last_waypoint:
            self.last_waypoint = m.seq
            if self.settings.wpupdates:
                self.say("waypoint %u" % m.seq, priority='message')
    elif mtype == "MISSION_ITEM_REACHED":
        wp = self.module('wp').wploader.wp(m.seq)
        if wp is None:
            # should we spit out a warning?!
            # self.say("No waypoints")
            self.next_wp = None
        else:
            if wp.command == mavutil.mavlink.MAV_CMD_DO_LAND_START:
                alt_offset = self.get_mav_param('ALT_OFFSET', 0)
                if alt_offset > 0.005:
                    self.say(
                        "ALT OFFSET IS NOT ZERO passing DO_LAND_START")
            # NOTE(review): recorded for every reached waypoint; confirm it
            # was not meant only for DO_LAND_START items.
            self.next_wp = wp
    elif mtype == "FENCE_STATUS":
        self.module('fence').last_fence_breach = m.breach_time
        self.module('fence').last_fence_status = m.breach_status
    return
def command_log(*args):
    """Report recent accesses found in the tail of the log file.

    Scans the last 100 lines of ``logfile_path`` for syslog-style entries
    whose message contains ``requests <IPv4 address>`` and remembers the
    latest timestamp per address.

    :param args: optional ``[ip]`` (substring an address must contain) and
        ``[time period in s]`` (maximum age of an access to be reported)
    :return: ``EXIT_FAILURE`` when a matching address was accessed within
        the period (the accesses are printed), else ``EXIT_SUCCESS``
    :raises Exception: on more than two arguments, or when a timestamp lies
        in the future even after assuming the previous year
    """
    if len(args) > 2:
        raise Exception(
            "Too many arguments: [ip] [time period in s] (optional parameter)")

    ip = args[0] if len(args) >= 1 else ''
    max_ago = float(args[1]) if len(args) == 2 else float('inf')

    import re
    import string
    import time
    from collections import deque
    from pyparsing import (Word, alphas, Suppress, Combine, nums, Optional,
                           Regex, ParseException)

    # define line in (sys)log
    month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3)
    integer = Word(nums)
    serverDateTime = Combine(month + " " + integer + " " + integer + ":" +
                             integer + ":" + integer)
    hostname = Word(alphas + nums + "_" + "-")
    daemon = Word(alphas + nums + "/" + "-" + "_") + Optional(
        Suppress("[") + integer + Suppress("]")) + Suppress(":")
    message = Regex(".*")
    bnf = serverDateTime + hostname + daemon + message

    # Dots are escaped so that only dotted quads match.
    request_pattern = re.compile(
        r'requests (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})')

    tail_n = 100
    # Close the file right after reading its tail.
    with open(logfile_path) as log_handle:
        tail_lines = deque(log_handle, tail_n)

    # One reference "now" for the whole run.
    cur = time.localtime()
    last_access = {}
    for line in tail_lines:
        try:
            fields = bnf.parseString(line)
        except ParseException:
            continue
        m = request_pattern.search(fields[-1])
        if not m:
            continue
        # syslog lines carry no year: guess the current one ...
        st = time.strptime(fields[0] + " %s" % cur.tm_year,
                           "%b %d %H:%M:%S %Y")
        if st > cur:
            # ... and fall back to the previous one for future dates
            st = time.strptime(fields[0] + " %s" % (cur.tm_year - 1),
                               "%b %d %H:%M:%S %Y")
            if st > cur:
                raise Exception("HMF logfile seems too old!?!")
        last_access[m.group(1)] = st

    now = time.mktime(cur)
    # Only addresses that match ``ip`` AND fall inside the period count.
    # Previously a recent access from any other address was enough to
    # return EXIT_FAILURE with an empty report.
    recent_accesses = []
    for resource, timestamp in last_access.items():
        age = now - time.mktime(timestamp)
        if ip in resource and age <= max_ago:
            recent_accesses.append((resource, timestamp, age))

    if recent_accesses:
        print("Previous accesses:")
        for resource, timestamp, age in recent_accesses:
            print("\t%s was accessed on %s (%.1fs ago)" % (
                resource, time.asctime(timestamp), age))
        return EXIT_FAILURE
    return EXIT_SUCCESS
def main():
    """Compute the features of every unit (comment) and export them to CSV.

    Whole-frame features (topic model columns and text lengths) are added
    first; then each row of ``create_features.units`` is processed in turn
    and its per-comment features are written back into the same frame with
    ``.loc``.  Rows for which ``loop_over_comment_for_quote`` returns -1 are
    counted and skipped, so they keep no per-comment features.  The
    transposed frame is finally written to ``features_CMV.csv`` in
    ``data_directory``.
    """
    topics_number = 15
    print('{}: Loading the data'.format((time.asctime(time.localtime(time.time())))))
    create_features = CreateFeatures(topics_number)
    print('{}: Finish loading the data'.format((time.asctime(time.localtime(time.time())))))
    print('data sizes: all data: {}, units data: {}'.format(create_features.all_data.shape,
                                                            create_features.units.shape))

    # Features calculated for all the data frame:
    topic_model_result = create_features.topic_model()
    create_features.units = create_features.units.merge(topic_model_result, on='comment_id')
    create_features.units['comment_len'] = create_features.units['comment_body'].str.len()
    create_features.units['submission_len'] = create_features.units['submission_body'].str.len()
    create_features.units['title_len'] = create_features.units['submission_title'].str.len()

    # new_index counts only the rows that were fully processed (it drives the
    # progress message); skipped rows are counted separately.
    new_index = 0
    number_of_treatment_minus_1 = 0
    for index, comment in create_features.units.iterrows():
        if new_index % 100 == 0:
            print('{}: Finish calculate {} samples'.format((time.asctime(time.localtime(time.time()))), new_index))
        comment_author = copy(comment['comment_author'])
        comment_time = copy(comment['comment_created_utc'])
        submission_time = copy(comment['submission_created_utc'])
        submission_id = copy(comment['submission_id'])
        submission_num_comments = copy(comment['submission_num_comments'])
        comment_body = copy(comment['comment_body'])
        submission_body = copy(comment['submission_body'])
        title = copy(comment['submission_title'])

        # treatment:
        is_quote = create_features.loop_over_comment_for_quote(comment, comment_body)
        if is_quote != -1:
            create_features.units.loc[index, 'treated'] = is_quote
        else:
            # -1 means no treatment value could be determined: skip the row
            print('{}: treatment = -1'.format((time.asctime(time.localtime(time.time())))))
            number_of_treatment_minus_1 += 1
            continue

        # Get comment author features:
        # print('{}: Get comment author features'.format((time.asctime(time.localtime(time.time())))))
        create_features.units.loc[index, 'commenter_number_submission'] =\
            create_features.number_of_message(comment_author, comment_time, 'submission')
        create_features.units.loc[index, 'commenter_number_comment'] =\
            create_features.number_of_message(comment_author, comment_time, 'comment')
        create_features.units.loc[index, 'commenter_seniority_days'] =\
            create_features.calculate_user_seniority(comment_author)

        # Get submission author features:
        # print('{}: Get submission author features'.format((time.asctime(time.localtime(time.time())))))
        submission_author = comment['submission_author']
        create_features.units.loc[index, 'submitter_number_submission']\
            = create_features.number_of_message(submission_author, comment_time, 'submission')
        create_features.units.loc[index, 'submitter_number_comment']\
            = create_features.number_of_message(submission_author, comment_time, 'comment')
        create_features.units.loc[index, 'submitter_seniority_days'] =\
            create_features.calculate_user_seniority(submission_author)
        # only the first two of the four values returned by comment_in_tree are kept here
        create_features.units.loc[index, 'is_first_comment_in_tree'],\
            create_features.units.loc[index, 'number_of_comments_in_tree_by_comment_user'], _, _ = \
            create_features.comment_in_tree(comment_author, comment_time, submission_id)

        # Get the time between the submission and the comment time and the ration between the first comment:
        # print('{}: Get the time between the submission and the comment time and the ration between the first comment'
        #       .format((time.asctime(time.localtime(time.time())))))
        # time_between_messages is stored as <whole hours> + <remaining minutes> / 100
        # (presumably 'time_between' is in seconds -- TODO confirm)
        time_to_comment = comment['time_between']
        time_between_messages_hour = math.floor(time_to_comment/3600.0)
        time_between_messages_min = math.floor((time_to_comment - 3600*time_between_messages_hour)/60.0)/100.0
        create_features.units.loc[index, 'time_between_messages'] =\
            time_between_messages_hour + time_between_messages_min
        time_until_first_comment, time_between_comment_first_comment =\
            create_features.time_to_first_comment(submission_id, submission_time, comment_time)
        # guard against a division by zero
        if time_to_comment > 0:
            create_features.units.loc[index, 'time_ratio'] = time_until_first_comment/time_to_comment
        else:
            create_features.units.loc[index, 'time_ratio'] = 0
        create_features.units.loc[index, 'time_until_first_comment'] = time_until_first_comment
        create_features.units.loc[index, 'time_between_comment_first_comment'] = time_between_comment_first_comment

        # Get the numbers of comments by the submitter
        _, create_features.units.loc[index, 'number_of_comments_in_tree_from_submitter'],\
            number_of_respond_by_submitter, number_of_respond_by_submitter_total =\
            create_features.comment_in_tree(submission_author, comment_time, submission_id, comment_author, True)
        create_features.units.loc[index, 'number_of_respond_by_submitter'],\
            create_features.units.loc[index, 'number_of_respond_by_submitter_total'] \
            = number_of_respond_by_submitter, number_of_respond_by_submitter_total

        # Ratio of comments number:
        # print('{}: Ratio of comments number'.format((time.asctime(time.localtime(time.time())))))
        # both ratios fall back to 0 when their denominator is 0
        if submission_num_comments == 0:
            create_features.units.loc[index, 'respond_to_comment_user_all_ratio'] = 0
            create_features.units.loc[index, 'respond_total_ratio'] = 0
        else:
            create_features.units.loc[index, 'respond_to_comment_user_all_ratio'] =\
                number_of_respond_by_submitter / submission_num_comments
            create_features.units.loc[index, 'respond_total_ratio'] =\
                number_of_respond_by_submitter_total / submission_num_comments
        if number_of_respond_by_submitter_total == 0:
            create_features.units.loc[index, 'respond_to_comment_user_responses_ratio'] = 0
        else:
            create_features.units.loc[index, 'respond_to_comment_user_responses_ratio'] =\
                number_of_respond_by_submitter / number_of_respond_by_submitter_total

        # Sentiment analysis:
        # the first three entries of each sentiment list are stored as the
        # *_pos, *_neg and *_neutral columns, in that order
        # for the comment:
        print('{}: Sentiment analysis'.format((time.asctime(time.localtime(time.time())))))
        comment_sentiment_list = sentiment_analysis(comment_body)
        create_features.units.loc[index, 'nltk_com_sen_pos'], create_features.units.loc[index, 'nltk_com_sen_neg'], \
            create_features.units.loc[index, 'nltk_com_sen_neutral'] = \
            comment_sentiment_list[0], comment_sentiment_list[1], comment_sentiment_list[2]
        # for the submission:
        sub_sentiment_list = sentiment_analysis(submission_body)
        create_features.units.loc[index, 'nltk_sub_sen_pos'], create_features.units.loc[index, 'nltk_sub_sen_neg'],\
            create_features.units.loc[index, 'nltk_sub_sen_neutral'] = \
            sub_sentiment_list[0], sub_sentiment_list[1], sub_sentiment_list[2]
        # for the title
        title_sentiment_list = sentiment_analysis(title)
        create_features.units.loc[index, 'nltk_title_sen_pos'], create_features.units.loc[index, 'nltk_title_sen_neg'], \
            create_features.units.loc[index, 'nltk_title_sen_neutral'] = \
            title_sentiment_list[0], title_sentiment_list[1], title_sentiment_list[2]
        # cosine similarity between submission's sentiment vector and comment sentiment vector:
        # (reshape(1, -1) turns each list into a single-row 2-D array)
        sentiment_sub = np.array(sub_sentiment_list).reshape(1, -1)
        sentiment_com = np.array(comment_sentiment_list).reshape(1, -1)
        create_features.units.loc[index, 'nltk_sim_sen'] = cosine_similarity(sentiment_sub, sentiment_com)[0][0]

        # percent of adjective in the comment:
        # print('{}: percent of adjective in the comment'.format((time.asctime(time.localtime(time.time())))))
        create_features.units.loc[index, 'percent_adj'] = percent_of_adj(comment_body)

        new_index += 1

    # export the data to csv file (transposed: one column per unit)
    create_features.units.T.to_csv(os.path.join(data_directory, 'features_CMV.csv'), encoding='utf-8')

    print('number_of_treatment_minus_1: ', number_of_treatment_minus_1)
def RunJob(job, joblist, source):
    """Run ``job.cmd`` in a shell and append its output to the shared log.

    The command's stdout/stderr are captured in a temporary
    ``dispatch-<random>.log`` file inside ``job.workdir``.  While the child
    runs, ``joblist.killMe()`` is polled on the first one-second pass and on
    every tenth pass after that.  A kill request, or any error raised while
    polling, terminates the process tree with ``taskkill`` and reports a
    return code of 0.  Afterwards the temporary log is appended to
    ``LOGGERNAME`` in the working directory while holding ``LOGGERLOCK``,
    and is then deleted on a best-effort basis.

    Args:
        job: object providing ``workdir``, ``cmd`` and ``env``.  ``env`` is a
            mapping of environment overrides; the value stored under the key
            ``PATH_PREFIX`` is prepended to ``PATH`` instead of being set as
            a variable of its own.
        joblist: object whose ``killMe()`` returns True when the job should
            be aborted.
        source: not used by this function.

    Returns:
        ``(rtncode, logname)``, or ``(3, None)`` when the temporary log file
        could not be created.
    """
    try:
        logname = os.path.join(
            job.workdir,
            "dispatch-" + str(random.randint(100000000, 999999999)) + ".log")
        log = open(logname, "w")
    except Exception:
        # Couldn't reach the workdir, or couldn't open the logfile.  Die.
        return 3, None

    try:
        # Child environment: a copy of ours plus the job's overrides.
        envvars = os.environ.copy()
        for key, val in job.env.items():
            if key == "PATH_PREFIX":
                envvars['PATH'] = val + envvars['PATH']
            else:
                envvars[key] = val
        print("\n# Environment PATH: %s" % envvars['PATH'])

        child = subprocess.Popen(job.cmd, cwd=job.workdir, env=envvars,
                                 shell=True, stdout=log, stderr=log)
        kill_cmd = "taskkill /F /T /PID %i" % child.pid

        wait = 0
        rtncode = None
        while rtncode is None:
            try:
                time.sleep(1)
                rtncode = child.poll()
                # Only ask for the kill status every tenth pass.
                if wait % 10 == 0:
                    wait = 0
                    if joblist.killMe() == True:
                        print("Got Kill Request")
                        subprocess.Popen(kill_cmd, shell=True)
                        rtncode = 0
                        break
                wait += 1
            except Exception:
                # Any failure while polling is treated as a lost connection.
                print("Lost connection: Killing job!")
                subprocess.Popen(kill_cmd, shell=True)
                rtncode = 0
                break
    finally:
        # Always release the write handle, even if set-up or spawning failed.
        log.close()

    # Append the per-job log to the shared one.  The lock keeps concurrent
    # jobs from interleaving their output; try/finally guarantees it is
    # released even when the copy fails.
    LOGGERLOCK.acquire()
    try:
        with open(logname, "r") as joblog:
            with open(os.path.join(job.workdir, LOGGERNAME), "a") as logfile:
                for line in joblog:
                    logfile.write(line)
                logfile.write("======= Finished " + time.asctime() +
                              " ==============================\n")
        try:
            os.remove(logname)
        except OSError:
            # Sometimes Windows doesn't release the logfile; leave it behind.
            pass
    finally:
        LOGGERLOCK.release()
    return rtncode, logname
def diagbtn2_fun(self):
    """Classify the samples in the selected file and log the diagnosis.

    Reads the file whose path is in ``self.le_url``, converts its text into
    feature rows with ``self.extract_data`` and classifies them with
    ``svm_breast.predict``.  For every prediction a result line is appended
    to ``self.restb`` and a detailed record is written to a new text file
    under ``logs/``, named after ``self.le1`` and the current local time.
    Finally the quoted log path is stored in ``self.logurl`` and
    ``self.logbtn`` is shown.

    Both files are handled with ``with`` blocks, so the log is flushed and
    closed before the log button is shown.
    """
    # Labels written for the nine feature columns, in column order.
    field_labels = (
        "\nClump Thickness: ",
        "\nUniformity of Cell Size: ",
        "\nUniformity of Cell Shape: ",
        "\nMarginal Adhesion: ",
        "\nSingle Epithelial Cell Size: ",
        "\nBare Nuclei: ",
        "\nBland Chromatin: ",
        "\nNormal Nucleoli: ",
        "\nMitoses: ",
    )

    with open(self.le_url.text(), 'r') as source_file:
        text1 = source_file.read()
    features = self.extract_data(text1)

    self.pb.show()
    self.pb.setValue(0)
    self.completed = 0
    res = svm_breast.predict(features)
    # The prediction is already done here; this loop only animates the bar.
    while self.completed <= 100:
        self.completed += 0.0075
        self.pb.setValue(self.completed)
    self.pb.hide()
    self.restb.setText("")

    localtime = time.asctime(time.localtime(time.time()))
    doctor = self.le1.text()
    url = "logs/" + doctor + localtime + ".txt"
    with open(url, 'w+') as log_file:
        log_file.write("---------Diagnosed By :Dr." + doctor + " On " +
                       localtime + "----------------\n")
        for position, ans in enumerate(res):
            sample_id = str(position + 1)  # ids are 1-based
            row = features[position]
            log_file.write("\nId:" + sample_id)
            for column, label in enumerate(field_labels):
                log_file.write(label + str(row[column]))
            # The predictions are compared as strings; '4' means malignant.
            if ans == '4':
                self.restb.append("Id: " + sample_id + "\t\t Result: Malignant ")
                log_file.write("\nResult: Malignant\n")
            else:
                self.restb.append("Id: " + sample_id + "\t\t Result: Benign ")
                log_file.write("\nResult: Benign\n")

    self.restb.append(
        "-----------------------------------------------------------------------------------------------------"
    )
    self.restb.append("Diagnosed By: Dr. " + self.le1.text())
    url1 = " '//home//karan//Desktop//Breast Cancer Detection Project//UI//logs//"
    self.logurl = url1 + self.le1.text() + localtime + ".txt' "
    self.logbtn.show()