def run(self):
    i = 0
    self.new_labels = []
    for raw_segment, label_sequence in zip(self.raw_segments, self.label_sequences):
        # Decode the raw segment with the retrained HMM; keep only the label sequence
        new_labels = self.hmm_new.decode(raw_segment)[1]
        self.new_labels.append(new_labels)
        tokens = Tokens(raw_segment).tokens
        feature_vectors = FeatureGenerator(raw_segment).features
        print i, ': ', raw_segment
        for token, old_label, new_label, feature_vector in zip(tokens, label_sequence, new_labels, feature_vectors):
            print to_label(old_label), '\t', to_label(new_label), '\t', token
            # Accumulate per-feature-vector label statistics, keyed on the old (reference) label
            self.feature_entity_list.add_entity(feature_vector, old_label, token)  #???? Old label first
        print '\n'
        i += 1
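# The print statements above rely on a project-local to_label helper that turns a
# numeric tag into a short printable name. Its definition is not shown here; the
# following is only a minimal sketch consistent with the tag comments used in
# run_with_boosting_features below (FN/LN for 0/1, DL for 2, TI for 3, VN for 4).
# The meaning of tag 5 is not stated in the source, so its name is an assumption.
_LABEL_NAMES = {0: 'FN', 1: 'LN', 2: 'DL', 3: 'TI', 4: 'VN', 5: 'OTHER'}

def to_label(index):
    # Map a numeric tag to its short name; fall back to 'UNK' for unknown tags.
    return _LABEL_NAMES.get(index, 'UNK')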
def create_dataset(self, dataset_type="graph", standardize_features=True, on_gpu=False, oversampling_ratio=1): """ Args: dataset_type:str. Has to be "graph", "sequential" or "raw" Returns: dict with keys "train", "val", "test": If dataset_type is "graph" contains list of torch_geometric.data.Data(x=x, y=y, edge_index=edge_index) If dataset_type is "sequential" contains list of (sequential_data, y) """ if dataset_type not in ["graph", "sequential", "raw"]: raise ValueError("Supported dataset types are: 'graph', 'sequential', 'raw'.") start_time = time.time() trees_to_parse = utils.get_tree_file_names(self.dataset_dir) labels = self.load_labels() # Create train-val-test split # Remove useless trees (i.e. with labels that we don't consider) news_ids_to_consider = list(labels.keys()) if self.only_binary: news_ids_to_consider = [news_id for news_id in news_ids_to_consider if labels[news_id] in ['false', 'true']] train_ids, val_ids = train_test_split(news_ids_to_consider, test_size=0.1, random_state=self.seed) train_ids, test_ids = train_test_split(train_ids, test_size=0.25, random_state=self.seed*7) print(f"Len train/val/test {len(train_ids)} {len(val_ids)} {len(test_ids)}") user_ids_in_train, tweet_ids_in_train = \ self.get_user_and_tweet_ids_in_train(trees_to_parse, train_ids) tweet_features = self.load_tweet_features() user_features = self.load_user_features() if standardize_features: print("Standardizing features") preprocessed_tweet_fts = self.preprocess_tweet_features(tweet_features, tweet_ids_in_train) preprocessed_user_fts = self.preprocess_user_features(user_features, user_ids_in_train, standardize_features) # basic_tests.test_user_preprocessed_features(preprocessed_user_fts) ids_to_dataset = {news_id: 'train' for news_id in train_ids} ids_to_dataset.update({news_id: 'val' for news_id in val_ids}) ids_to_dataset.update({news_id: 'test' for news_id in test_ids}) dataset = {'train': [], 'val': [], 'test': []} trees = [] for tree_file_name in trees_to_parse: news_id = utils.get_root_id(tree_file_name) label = labels[news_id] if (not self.only_binary) or (label in ['false', 'true']): node_features, edges = self.build_tree(tree_file_name, tweet_fts=preprocessed_tweet_fts, user_fts=preprocessed_user_fts) trees.append((news_id, label, node_features, edges)) self.oversample(trees, ids_to_dataset, ratio=oversampling_ratio) for news_id, label, node_features, edges in trees: if dataset_type == "graph": import torch_geometric x = torch.tensor(node_features, dtype=torch.float32) y = torch.tensor(utils.to_label(label)) edge_index = np.array([edge[:2] for edge in edges], dtype=int) # change if you want the time somewhere edge_index = torch.tensor(edge_index).t().contiguous() if on_gpu: y.to(torch.device("cuda")) x.to(torch.device("cuda")) edge_index.to(torch.device("cuda")) data_point = torch_geometric.data.Data(x=x, y=y, edge_index=edge_index) if on_gpu: data_point.to(torch.device("cuda")) dataset[ids_to_dataset[news_id]].append(data_point) # Uncomment for test, to see if graphs are well created # if news_id in [580320684305416192, 387021726007042051]: # basic_tests.inspect_graph(dataset[ids_to_dataset[news_id]][-1], news_id) elif dataset_type == "sequential": y = utils.to_label(label) sequential_data = np.array( node_features) # If we go for this one, returns the features of the successive new tweet-user tuples encountered over time dataset[ids_to_dataset[news_id]].append([sequential_data, y]) # print(sequential_data.mean(dim=0)) # print("label was {}".format(label)) elif dataset_type == "raw": 
dataset[ids_to_dataset[news_id]].append( [[label, news_id] + edge + list(node_features[edge[1]]) for edge in edges]) # edge = [node_index_in, node_index_out, time_out, uid_in, uid_out] print(f"Dataset loaded in {time.time() - start_time:.3f}s") return dataset
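# The "graph" splits returned above are plain lists of torch_geometric Data objects,
# so they can be batched directly with PyTorch Geometric's loader. A minimal usage
# sketch, assuming the method lives on some `builder` object (the surrounding class
# is not shown here); on older torch_geometric releases the loader is imported from
# torch_geometric.data instead of torch_geometric.loader.
from torch_geometric.loader import DataLoader

dataset = builder.create_dataset(dataset_type="graph", standardize_features=True)
train_loader = DataLoader(dataset["train"], batch_size=32, shuffle=True)

for batch in train_loader:
    # batch.x, batch.edge_index and batch.y are the collated node features,
    # edges and labels for one mini-batch of propagation trees.
    print(batch.num_graphs, batch.x.shape)
    break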
from utils import to_label, cal_accuracy
from net import FC_Net

epochs = 50000
learning_rate = 0.1

# train linear
net_linear = FC_Net(4, 4)
x_train, y_train = generate_linear()
linear_history = net_linear.train(x_train, y_train, epochs=epochs, learning_rate=learning_rate)

# test linear
y_hat = net_linear.test(x_train)
print('Testing output:')
print(y_hat)
y_hat_label = to_label(y_hat)
show_result(x_train, y_train, y_hat_label)
print(f'Accuracy = {cal_accuracy(y_train, y_hat)}')

# train XOR
net_XOR = FC_Net(10, 10)
x_train, y_train = generate_XOR_easy()
xor_history = net_XOR.train(x_train, y_train, epochs=epochs, learning_rate=learning_rate)

# test XOR
y_hat = net_XOR.test(x_train)
print('Testing output:')
print(y_hat)
y_hat_label = to_label(y_hat)
show_result(x_train, y_train, y_hat_label)
print(f'Accuracy = {cal_accuracy(y_train, y_hat)}')
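# The script above thresholds the network's probability outputs for evaluation; the
# to_label and cal_accuracy helpers imported from utils are not shown. Below is only
# a minimal sketch consistent with how they are called here (an assumption, not the
# original implementation), using a 0.5 decision threshold.
import numpy as np

def to_label(y_hat, threshold=0.5):
    # Turn predicted probabilities into hard 0/1 labels.
    return (np.asarray(y_hat) >= threshold).astype(int)

def cal_accuracy(y_true, y_hat, threshold=0.5):
    # Fraction of predictions whose thresholded label matches the ground truth.
    y_pred = to_label(y_hat, threshold)
    return float(np.mean(y_pred == np.asarray(y_true).astype(int)))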
def run_with_boosting_features(self):
    i = 0
    self.new_labels = []
    self.combined_labels = []
    for raw_segment, label_sequence in zip(self.raw_segments, self.label_sequences):
        feature_vectors, new_labels = self.hmm_new.decode(raw_segment, True, True, self.token_BGM, self.pattern_BGM)
        self.new_labels.append(new_labels)
        tokens = Tokens(raw_segment).tokens
        print i, ': ', raw_segment

        # Combination step:
        tmp_combined_labels = []  # the decided combined labels so far
        for token, old_label, new_label, feature_vector in zip(tokens, label_sequence, new_labels, feature_vectors):
            # Combine the old and new labels into one combined label
            combined_label = -1
            if old_label == new_label:
                combined_label = new_label
                tmp_combined_labels.append(new_label)
            # Combine compatible labels: FN and LN
            elif old_label in [0, 1] and new_label in [0, 1]:
                combined_label = old_label
                tmp_combined_labels.append(new_label)
            # Combine labels that are not compatible
            else:
                # Get the background knowledge for this feature vector: the language feature model
                tmp_feature_entity = self.hmm_new.feature_entity_list.lookup(feature_vector)
                sorted_label_distribution = sorted(tmp_feature_entity.label_distribution.iteritems(),
                                                   key=operator.itemgetter(1), reverse=True)
                total_label_occurence = float(sum(tmp[1] for tmp in sorted_label_distribution))

                # ============================================================================================
                # ============================================================================================
                # ???? Experimenting: removing the low prob label distribution; FAILURE; ARCHIVED HERE AND DEPRECATED
                # sorted_label_distribution = []
                # sum_prob = 0.0
                # for pair in tmp_sorted_label_distribution:
                #     sorted_label_distribution.append(pair)
                #     sum_prob += pair[1]
                #     if sum_prob/total_label_occurence >= 0.90:
                #         break
                # ============================================================================================
                # ============================================================================================

                # Dominant label case: iterate from the highest label stats for this feature vector
                for label_frequency in sorted_label_distribution:
                    if int(label_frequency[0]) in [old_label, new_label] and \
                            (label_frequency[1] / total_label_occurence) >= self.DOMINANT_RATIO:
                        print 'Dominant labels'
                        # Check for constraint:
                        tmp_label_to_check = int(label_frequency[0])

                        # Find last occurence position of this label
                        if tmp_label_to_check not in [0, 1]:
                            last_occurence = ''.join([str(c) for c in tmp_combined_labels]).rfind(str(tmp_label_to_check))
                        elif tmp_label_to_check in [0, 1]:
                            last_occurence_0 = ''.join([str(c) for c in tmp_combined_labels]).rfind('0')
                            last_occurence_1 = ''.join([str(c) for c in tmp_combined_labels]).rfind('1')
                            last_occurence = max(last_occurence_0, last_occurence_1)

                        # Checking constraints by simplifying what we did in viterbi
                        if last_occurence == -1 or last_occurence == (len(tmp_combined_labels) - 1):
                            # Never occurred, or last occurence is the last label
                            # When we are deciding the first label
                            if len(tmp_combined_labels) == 0:
                                first_bit = self.find_majority_structure()[0]
                                if first_bit == 0 and tmp_label_to_check not in [0, 1]:
                                    continue
                                if first_bit == 3 and tmp_label_to_check != 3:
                                    continue
                            # VN CANNOT FOLLOW TI W/O DL constraint
                            if tmp_label_to_check == 4 and tmp_combined_labels[-1] == 3:
                                continue
                        elif tmp_label_to_check in [0, 1]:
                            flag = False
                            for j in range(last_occurence, len(tmp_combined_labels)):
                                if tmp_combined_labels[j] not in [0, 1, 2]:
                                    flag = True
                                    break
                            if flag:
                                continue
                        elif tmp_label_to_check == 3:
                            continue
                        elif tmp_label_to_check == 4:
                            if tmp_combined_labels[-1] == 3:  #????
                                continue

                        combined_label = tmp_label_to_check
                        tmp_combined_labels.append(tmp_label_to_check)
                        break

                # No dominance case OR dominance-failed-due-to-constraint case: check whether the label
                # with higher probability follows the constraint of publication order
                if combined_label == -1:
                    # Iterate from the highest label stats according to this feature vector:
                    for label_frequency in sorted_label_distribution:
                        breakout_flag = False
                        # Test against constraints
                        # 1. DL separate labels principle
                        # 2. AU-TI-VN Order
                        if int(label_frequency[0]) in [old_label, new_label]:
                            tmp_label_to_check = int(label_frequency[0])

                            # Find the structure of the order so far (what has appeared) to predict what may appear next
                            structure_overview = []  # will record the order in a big sense: 0,3,4 / 4,0,3
                            for tmp_combined_label in tmp_combined_labels:
                                if tmp_combined_label in [2, 5]:
                                    continue
                                elif tmp_combined_label in [0, 1]:
                                    if 0 in structure_overview:
                                        continue
                                    else:
                                        structure_overview.append(0)
                                elif tmp_combined_label == 3:
                                    if 3 in structure_overview:
                                        continue
                                    else:
                                        structure_overview.append(3)
                                elif tmp_combined_label == 4:
                                    if 4 in structure_overview:
                                        continue
                                    else:
                                        structure_overview.append(4)

                            # Based on the structure overview, find what should appear next
                            appear_next = []
                            if structure_overview == [0]:
                                appear_next = [0, 1, 3, 2, 5]
                            elif structure_overview == [3]:
                                appear_next = [3, 0, 1, 2, 5]
                            elif structure_overview == [0, 3]:
                                appear_next = [3, 4, 2, 5]
                            elif structure_overview == [3, 0]:
                                appear_next = [0, 1, 4, 2, 5]
                            elif structure_overview == [0, 3, 4]:
                                appear_next = [4, 2, 5]
                            elif structure_overview == [3, 0, 4]:
                                appear_next = [4, 2, 5]
                            else:
                                # weird case
                                print 'Weird structure! Weird case!'
                                if tmp_feature_entity.label_distribution[str(old_label)] > tmp_feature_entity.label_distribution[str(new_label)]:
                                    tmp_label_to_check_list = [old_label, new_label]
                                else:
                                    tmp_label_to_check_list = [new_label, old_label]
                                # Apply constraints here too
                                for tmp_label_to_check in tmp_label_to_check_list:
                                    if tmp_label_to_check not in [0, 1]:
                                        last_occurence = ''.join([str(c) for c in tmp_combined_labels]).rfind(str(tmp_label_to_check))
                                    elif tmp_label_to_check in [0, 1]:
                                        last_occurence_0 = ''.join([str(c) for c in tmp_combined_labels]).rfind('0')
                                        last_occurence_1 = ''.join([str(c) for c in tmp_combined_labels]).rfind('1')
                                        last_occurence = max(last_occurence_0, last_occurence_1)
                                    # Checking constraints by simplifying what we did in viterbi
                                    if last_occurence == -1 or last_occurence == (len(tmp_combined_labels) - 1):
                                        # When we are deciding the first label
                                        if len(tmp_combined_labels) == 0:
                                            first_bit = self.find_majority_structure()[0]
                                            if first_bit == 0 and tmp_label_to_check not in [0, 1]:
                                                continue
                                            if first_bit == 3 and tmp_label_to_check != 3:
                                                continue
                                        try:
                                            if tmp_label_to_check == 4 and tmp_combined_labels[-1] == 3:
                                                continue
                                        except:
                                            continue
                                    elif tmp_label_to_check in [0, 1]:
                                        flag = False
                                        for j in range(last_occurence, len(tmp_combined_labels)):
                                            if tmp_combined_labels[j] not in [0, 1, 2]:
                                                flag = True
                                                break
                                        if flag:
                                            continue
                                    elif tmp_label_to_check == 3:
                                        continue
                                    elif tmp_label_to_check == 4:
                                        if tmp_combined_labels[-1] == 3:
                                            continue
                                    combined_label = tmp_label_to_check
                                    tmp_combined_labels.append(combined_label)
                                    breakout_flag = True
                                    break
                                if breakout_flag:
                                    break

                            if tmp_label_to_check in appear_next:
                                # Then check constraint: find last occurence, DL constraints.
                                # Just need to check DL constraints, no need to verify more on tokens;
                                # assume token verification is done in the first iteration.
                                if tmp_label_to_check not in [0, 1]:
                                    last_occurence = ''.join([str(c) for c in tmp_combined_labels]).rfind(str(tmp_label_to_check))
                                elif tmp_label_to_check in [0, 1]:
                                    last_occurence_0 = ''.join([str(c) for c in tmp_combined_labels]).rfind('0')
                                    last_occurence_1 = ''.join([str(c) for c in tmp_combined_labels]).rfind('1')
                                    last_occurence = max(last_occurence_0, last_occurence_1)
                                # Checking constraints by simplifying what we did in viterbi
                                if last_occurence == -1 or last_occurence == (len(tmp_combined_labels) - 1):
                                    if tmp_label_to_check == 4 and tmp_combined_labels[-1] == 3:
                                        # Hardcode rule [2013/07/23]: For VN, cannot directly follow a TI without DL???? may remove on real effect
                                        continue
                                elif tmp_label_to_check in [0, 1]:
                                    flag = False
                                    for j in range(last_occurence, len(tmp_combined_labels)):
                                        if tmp_combined_labels[j] not in [0, 1, 2]:
                                            flag = True
                                            break
                                    if flag:
                                        continue
                                elif tmp_label_to_check == 3:
                                    continue
                                    # flag = False
                                    # for j in range(last_occurence, len(tmp_combined_labels)):
                                    #     if tmp_combined_labels[j] not in [3,2]:
                                    #         flag = True
                                    #         break
                                    # if flag:
                                    #     continue
                                elif tmp_label_to_check == 4:
                                    if tmp_combined_labels[-1] == 3:  #????
                                        continue
                                # elif tmp_label_to_check == 2:
                                # elif tmp_label_to_check == 5:

                                # Otherwise, pass
                                log_err('\t\t' + str(i) + 'Should combine this one')
                                combined_label = tmp_label_to_check
                                tmp_combined_labels.append(tmp_label_to_check)
                                # combined_label = (tmp_label_to_check, sorted_label_distribution)
                                break
                            else:
                                continue

                # Debug
                if combined_label == -1:
                    log_err(str(i) + 'problem')
                    combined_label = (appear_next, sorted_label_distribution)
                    tmp_combined_labels.append(-1)

        # Final check of the accordance with the major order; ideally, all records under one
        # domain should have the same order... PS very ugly code I admit
        print '==========================tmp_combined_labels', tmp_combined_labels
        majority_order_structure = self.find_majority_structure()[1]
        majority_rate = self.find_majority_structure()[2]
        tmp_combined_labels_length = len(tmp_combined_labels)
        if majority_rate > 0.80 and majority_order_structure == [0, 3, 4]:
            # p1(phase1): author segments
            for p1 in range(tmp_combined_labels_length):
                if tmp_combined_labels[p1] in [0, 1, 2, 5]:
                    continue
                else:
                    break
            # p2(phase2): title segments
            for p2 in range(p1, tmp_combined_labels_length):
                if tmp_combined_labels[p2] == 3:
                    continue
                else:
                    break
            # p3(phase3): venue segments
            for p3 in range(p2, tmp_combined_labels_length):
                if tmp_combined_labels[p3] in [2, 5, 4]:
                    continue
                else:
                    break
            # Decision
            if p1 == 0:
                print 'Houston we got a SERIOUS problem!'
                log_err('Houston we got a SERIOUS problem!!!!!!!!')
            if p2 == p1:
                print 'Houston we got a problem!'
                for sp2 in range(p2, tmp_combined_labels_length):
                    if tmp_combined_labels[sp2] != 2:
                        tmp_combined_labels[sp2] = 3
                    else:
                        break
        # should fix common mislabeling at this point now??????????
        # elif majority_rate > 0.80 and majority_order_structure == [3,0,4]:  # ???? not sure if this is normal
        #     # p1(phase1): title segments
        #     for p1 in range(tmp_combined_labels_length):
        #         if tmp_combined_labels[p1] in [3]:
        #             continue
        #         else:
        #             break
        #     # p2(phase2): author segments
        #     for p2 in range(p1, tmp_combined_labels_length):
        #         if tmp_combined_labels[p2] == 3:
        #             continue
        #         else:
        #             break
        #     # p3(phase3): venue segments
        #     for p3 in range(p2, tmp_combined_labels_length):
        #         if tmp_combined_labels[p3] in [2,5,4]:
        #             continue
        #         else:
        #             break
        #     # Decision
        #     if p1 == 0:
        #         print 'Houston we got a SERIOUS problem!'
        #         log_err('Houston we got a SERIOUS problem!!!!!!!!')
        #     if p2 == p1:
        #         print 'Houston we got a problem!'
        #         for sp2 in range(p2, tmp_combined_labels_length):
        #             if tmp_combined_labels[sp2] != 2:
        #                 tmp_combined_labels[sp2] = 3
        #             else:
        #                 break

        for old_label, new_label, tmp_combined_label, token, feature_vector in zip(
                label_sequence, new_labels, tmp_combined_labels, tokens, feature_vectors):
            print to_label(old_label), '\t', to_label(new_label), '\t', to_label(tmp_combined_label), '\t', token, '\t', feature_vector
        print '\n'
        i += 1
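# The dominance test above keys on label_distribution, a per-feature-vector count of how
# often each tag was seen; a tag wins outright only when it is one of the two disagreeing
# candidates and owns at least DOMINANT_RATIO of the total mass. A small self-contained
# illustration of that check with made-up counts (names, counts and the 0.8 threshold are
# hypothetical, not taken from the original data); sketched in Python 3 dict style.
DOMINANT_RATIO = 0.8  # assumed threshold; the real value lives on the class

label_distribution = {'3': 18, '4': 2, '2': 1}   # tag -> occurrence count for one feature vector
old_label, new_label = 3, 4                      # the two disagreeing candidate tags

sorted_label_distribution = sorted(label_distribution.items(), key=lambda kv: kv[1], reverse=True)
total = float(sum(count for _, count in sorted_label_distribution))

for tag, count in sorted_label_distribution:
    if int(tag) in [old_label, new_label] and count / total >= DOMINANT_RATIO:
        print('dominant tag: ' + tag)   # here tag '3' (TI) wins with 18/21 ~ 0.86
        break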
def create_dataset(self, dataset_type="id_index", standardize_features=True, on_gpu=False, \ oversampling_ratio=1): """ Args: dataset_type:str. Has to be "train_val", "id_index" or "raw" Returns: dict with keys "train", "val", "test": """ if dataset_type not in ["train_val", "id_index", "raw"]: raise ValueError("Supported dataset types are: 'train_val', 'id_index', 'raw'.") start_time = time.time() trees_to_parse = get_tree_file_names(self.dataset_dir) labels = self.load_labels() # Create train-val-test split # Remove useless trees (i.e. with labels that we don't consider) news_ids_to_consider = list(labels.keys()) if self.only_binary: news_ids_to_consider = [news_id for news_id in news_ids_to_consider if labels[news_id] in ['false', 'true']] train_ids, val_ids = train_test_split(news_ids_to_consider, test_size=0.01, random_state=self.seed) train_ids, test_ids = train_test_split(train_ids, test_size=0.3, random_state=self.seed*7) print(f"Len train/val/test {len(train_ids)} {len(val_ids)} {len(test_ids)}") user_ids_in_train, tweet_ids_in_train = \ self.get_user_and_tweet_ids_in_train(trees_to_parse, train_ids) # tweet_features = self.load_tweet_features_bert() # print("tweet_features_size:{}".format(len(tweet_features))) tweet_features = self.load_tweet_features_one_hot() user_features = self.load_user_features() print("User features:") for key in user_features: for k in user_features[key]: print('\t'+k) break # preprocessed_tweet_fts = self.preprocess_tweet_features(tweet_features, tweet_ids_in_train) # preprocessed_user_fts = self.preprocess_user_features(user_features, user_ids_in_train, \ # standardize_features) # basic_tests.test_user_preprocessed_features(preprocessed_user_fts) ids_to_dataset = {news_id: 'train' for news_id in train_ids} ids_to_dataset.update({news_id: 'val' for news_id in val_ids}) ids_to_dataset.update({news_id: 'test' for news_id in test_ids}) print("Parsing trees...") trees = [] for tree_file_name in trees_to_parse: news_id = get_root_id(tree_file_name) label = labels[news_id] if (not self.only_binary) or (label in ['false', 'true']): retweeters, retweet_lens, time_outs = self.get_retweet_list(tree_file_name, user_features) # node_features, edges = self.build_tree(tree_file_name, tweet_fts=preprocessed_tweet_fts, # user_fts=preprocessed_user_fts) adj, retweeter_fts = self.get_retweeter_adj(news_id, retweeters, \ retweet_lens, time_outs, user_features) trees.append((news_id, label, retweeter_fts, tweet_features[news_id], adj)) print("trees num: {}".format(len(trees))) print("Generating dataset...") if dataset_type == "train_val": dataset = {'train': {'data_all':[], 'padded_docs':[], 'cos':[], 'label':[]}, 'val': {'data_all':[], 'padded_docs':[], 'cos':[], 'label':[]}, 'test': {'data_all':[], 'padded_docs':[], 'cos':[], 'label':[]}} elif dataset_type == "id_index": dataset = {} for news_id, label, retweeter_fts, tweet_feature, adj in trees: # dataset[news_id] = {'data_all':[], 'padded_docs':[], 'cos':[], 'label':[]} dataset[news_id] = {} data_all = [] padded_docs = [] cos = [] for news_id, label, retweeter_fts, tweet_feature, adj in trees: x = tweet_feature y = np.array(to_label(label)) retweeter_fts = retweeter_fts.astype('float') if dataset_type == "train_val": dataset[ids_to_dataset[news_id]]['data_all'].append(retweeter_fts) dataset[ids_to_dataset[news_id]]['padded_docs'].append(x) dataset[ids_to_dataset[news_id]]['cos'].append(adj) dataset[ids_to_dataset[news_id]]['label'].append(y) elif dataset_type == "id_index": dataset[news_id]['data_all'] = retweeter_fts 
dataset[news_id]['padded_docs'] = x dataset[news_id]['cos'] = adj dataset[news_id]['label'] = y # dataset[news_id]['data_all'].append(retweeter_fts) # dataset[news_id]['padded_docs'].append(x) # dataset[news_id]['cos'].append(adj) # dataset[news_id]['label'].append(y) # elif dataset_type == "sequential": # y = to_label(label) # sequential_data = np.array(node_features) # # If we go for this one, returns the features of the successive new tweet-user tuples # # encountered over time # dataset[ids_to_dataset[news_id]].append([sequential_data, y]) # # print(sequential_data.mean(dim=0)) # # print("label was {}".format(label)) # elif dataset_type == "raw": # dataset[ids_to_dataset[news_id]].append( # [[label, news_id] + edge + list(node_features[edge[1]]) for edge in # edges]) # edge = [node_index_in, node_index_out, time_out, uid_in, uid_out] if dataset_type == 'train_val': for key in dataset: # print(type(dataset[key]['data_all']), type(dataset[key]['data_all'][0])) dataset[key]['data_all'] = np.array(dataset[key]['data_all']) # print(dataset[key]['data_all'].shape) # dataset[key]['data_all'] = torch.from_numpy(dataset[key]['data_all']) dataset[key]['padded_docs'] = np.array(dataset[key]['padded_docs']) dataset[key]['cos'] = np.array(dataset[key]['cos']) dataset[key]['label'] = np.array(dataset[key]['label']) print(f"Dataset loaded in {time.time() - start_time:.3f}s") return dataset
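# Hypothetical usage sketch for the "train_val" variant above (the `builder` object
# and variable names are assumptions, not part of the original source): each split is
# a dict of numpy arrays ready to feed a training loop.
dataset = builder.create_dataset(dataset_type="train_val")

X_user = dataset['train']['data_all']      # retweeter feature sequences
X_text = dataset['train']['padded_docs']   # one-hot / padded tweet features
A = dataset['train']['cos']                # per-cascade adjacency matrices
y = dataset['train']['label']              # binary labels

print(X_user.shape, X_text.shape, A.shape, y.shape)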
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 6 20:53:36 2018

@author: Arpit
"""
from utils import save_preds, get_songs, split_data, to_label
from data_processing import get_image_data
from model import Model

# get names of all the songs
songs = get_songs()

# split them into training and validation sets according to the given percentage
songs_train, songs_valid = split_data(songs, 0.85)

# get actual spectrogram (2d np.array) data for the songs
X_train, Y_train = get_image_data('train', songs_train)
X_valid, Y_valid = get_image_data('valid', songs_valid)

# get names and spectrogram data for the final testing set (which is to be uploaded)
X_test, keys = get_image_data('test')

model = Model(False)
model.train(X_train, Y_train, X_valid, Y_valid, 5000)

preds = model.predict(X_test)
preds = [to_label(pred) for pred in preds]
save_preds(keys, preds, 'predictions.csv')
# fits a normalizer on the training data and transforms it
norm = Normalizer(norm='l1').fit(X)
X = norm.transform(X)

# gets Fourier-transformed data of the validation set and transforms it according to the training set
X_valid, Y_valid = get_song_data(validation_songs)
X_valid = norm.transform(X_valid)

# gets the convolutional network and fits the data
model = get_model(input_size)
model.fit(X, Y, n_epoch=20, validation_set=(X_valid, Y_valid), show_metric=True)

# prediction with voting is performed finally
test_songs = get_songs('test')
keys = []
preds = []
for song in test_songs:
    X, _ = get_song_data([song])
    X = norm.transform(X)
    Y_ = model.predict(X)
    # sum the per-excerpt class scores for the song, then take the winning class
    label = to_label(np.argmax(np.sum(Y_, axis=0)))
    preds.append(label)
    keys.append('.'.join(song.split('.')[:2]) + '.au')

save_preds(keys, preds, 'predictions_f.csv')
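# The per-song prediction above is a soft vote: model.predict returns one row of class
# scores per excerpt of the song, the rows are summed over axis 0, and the class with the
# largest total wins. A tiny worked example with made-up probabilities (values are
# illustrative only, not from the original data):
import numpy as np

# made-up per-excerpt class probabilities for one song (3 excerpts, 4 genres)
Y_ = np.array([[0.1, 0.6, 0.2, 0.1],
               [0.2, 0.5, 0.2, 0.1],
               [0.4, 0.3, 0.2, 0.1]])

votes = np.sum(Y_, axis=0)        # total support per genre: [0.7, 1.4, 0.6, 0.3]
winner = int(np.argmax(votes))    # genre index 1 wins the soft vote
print(winner)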