Example #1
 def run(self):
     i = 0
     self.new_labels = []
     for raw_segment, label_sequence in zip(self.raw_segments, self.label_sequences):
         new_labels = self.hmm_new.decode(raw_segment)[1]
         self.new_labels.append(new_labels)
         tokens = Tokens(raw_segment).tokens
         feature_vectors = FeatureGenerator(raw_segment).features
         print i, ':  ', raw_segment
         for token, old_label, new_label, feature_vector in zip(tokens, label_sequence, new_labels, feature_vectors):
             print to_label(old_label), '\t', to_label(new_label), '\t', token
             self.feature_entity_list.add_entity(feature_vector, old_label, token)   #???? Old label first
         print '\n'
         i+=1
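
Both this example and Example #4 print integer label ids through to_label before showing them next to each token. The helper itself is not part of the listing; a minimal sketch, assuming the six integer labels implied by the comments (FN/LN for first/last author name, DL for delimiter, TI for title, VN for venue; the role of 5 is unclear), could look like this:

# Hypothetical sketch of the to_label helper assumed by the HMM examples.
# The id-to-tag mapping is inferred from comments such as "VN CANNOT FOLLOW
# TI W/O DL" and is an assumption, not the original project's code.
LABEL_NAMES = {0: 'FN', 1: 'LN', 2: 'DL', 3: 'TI', 4: 'VN', 5: 'OTHER'}

def to_label(label_id):
    return LABEL_NAMES.get(label_id, 'UNK')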
Example #2
    def create_dataset(self, dataset_type="graph", standardize_features=True, on_gpu=False, oversampling_ratio=1):
        """
        Args:
            dataset_type:str. Has to be "graph", "sequential" or "raw"
        Returns:
            dict with keys "train", "val", "test":
                If dataset_type is "graph" contains list of
                    torch_geometric.data.Data(x=x, y=y, edge_index=edge_index)
                If dataset_type is "sequential" contains list of
                    (sequential_data, y)
        """
        if dataset_type not in ["graph", "sequential", "raw"]:
            raise ValueError("Supported dataset types are: 'graph', 'sequential', 'raw'.")

        start_time = time.time()

        trees_to_parse = utils.get_tree_file_names(self.dataset_dir)

        labels = self.load_labels()

        # Create train-val-test split
        # Remove useless trees (i.e. with labels that we don't consider)

        news_ids_to_consider = list(labels.keys())
        if self.only_binary:
            news_ids_to_consider = [news_id for news_id in news_ids_to_consider
                                    if labels[news_id] in ['false', 'true']]

        train_ids, val_ids = train_test_split(news_ids_to_consider, test_size=0.1, random_state=self.seed)
        train_ids, test_ids = train_test_split(train_ids, test_size=0.25, random_state=self.seed*7)
        print(f"Len train/val/test {len(train_ids)} {len(val_ids)} {len(test_ids)}")

        user_ids_in_train, tweet_ids_in_train = \
            self.get_user_and_tweet_ids_in_train(trees_to_parse, train_ids)

        tweet_features = self.load_tweet_features()
        user_features = self.load_user_features()

        if standardize_features:
            print("Standardizing features")

        preprocessed_tweet_fts = self.preprocess_tweet_features(tweet_features, tweet_ids_in_train)
        preprocessed_user_fts = self.preprocess_user_features(user_features, user_ids_in_train, standardize_features)

        # basic_tests.test_user_preprocessed_features(preprocessed_user_fts)

        ids_to_dataset = {news_id: 'train' for news_id in train_ids}
        ids_to_dataset.update({news_id: 'val' for news_id in val_ids})
        ids_to_dataset.update({news_id: 'test' for news_id in test_ids})

        dataset = {'train': [], 'val': [], 'test': []}

        trees = []

        for tree_file_name in trees_to_parse:
            news_id = utils.get_root_id(tree_file_name)
            label = labels[news_id]
            if (not self.only_binary) or (label in ['false', 'true']):
                node_features, edges = self.build_tree(tree_file_name, tweet_fts=preprocessed_tweet_fts,
                                                       user_fts=preprocessed_user_fts)
                trees.append((news_id, label, node_features, edges))

        self.oversample(trees, ids_to_dataset, ratio=oversampling_ratio)

        for news_id, label, node_features, edges in trees:

            if dataset_type == "graph":
                import torch_geometric
                x = torch.tensor(node_features, dtype=torch.float32)
                y = torch.tensor(utils.to_label(label))
                edge_index = np.array([edge[:2] for edge in edges],
                                      dtype=int)  # change if you want the time somewhere
                edge_index = torch.tensor(edge_index).t().contiguous()
                data_point = torch_geometric.data.Data(x=x, y=y, edge_index=edge_index)
                if on_gpu:
                    # Tensor.to() is not in-place, so move the whole Data object
                    # once it is built instead of calling .to() on each tensor
                    data_point = data_point.to(torch.device("cuda"))
                dataset[ids_to_dataset[news_id]].append(data_point)

                # Uncomment for test, to see if graphs are well created
                # if news_id in [580320684305416192, 387021726007042051]:
                #     basic_tests.inspect_graph(dataset[ids_to_dataset[news_id]][-1], news_id)

            elif dataset_type == "sequential":
                y = utils.to_label(label)
                # Features of the successive tweet-user pairs encountered over time
                sequential_data = np.array(node_features)
                dataset[ids_to_dataset[news_id]].append([sequential_data, y])
            elif dataset_type == "raw":
                # edge = [node_index_in, node_index_out, time_out, uid_in, uid_out]
                dataset[ids_to_dataset[news_id]].append(
                    [[label, news_id] + edge + list(node_features[edge[1]])
                     for edge in edges])

        print(f"Dataset loaded in {time.time() - start_time:.3f}s")

        return dataset
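
utils.to_label above turns a label string into the integer class index wrapped in torch.tensor. Only 'false' and 'true' are confirmed by this code; a plausible sketch, with the remaining class names and ordering assumed from typical rumor-detection datasets:

# Hypothetical sketch of utils.to_label; 'false'/'true' come from the code
# above, the other class names and the ordering are assumptions.
CLASSES = ['false', 'true', 'non-rumor', 'unverified']

def to_label(label):
    return CLASSES.index(label)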
Example #3
# NOTE: generate_linear, generate_XOR_easy and show_result are used below;
# they are assumed to live in utils alongside to_label and cal_accuracy.
from utils import (to_label, cal_accuracy, generate_linear,
                   generate_XOR_easy, show_result)
from net import FC_Net

epochs = 50000
learning_rate = 0.1

# train linear
net_linear = FC_Net(4, 4)
x_train, y_train = generate_linear()
linear_history = net_linear.train(x_train, y_train, epochs=epochs, learning_rate=learning_rate)

# test linear
y_hat = net_linear.test(x_train)
print('Testing output:')
print(y_hat)
y_hat_label = to_label(y_hat)
show_result(x_train, y_train, y_hat_label)
print(f'Accuracy = {cal_accuracy(y_train, y_hat)}')

# train XOR
net_XOR = FC_Net(10, 10)
x_train, y_train = generate_XOR_easy()
xor_history = net_XOR.train(x_train, y_train, epochs=epochs, learning_rate=learning_rate)

# test XOR
y_hat = net_XOR.test(x_train)
print('Testing output:')
print(y_hat)
y_hat_label = to_label(y_hat)
show_result(x_train, y_train, y_hat_label)
print(f'Accuracy = {cal_accuracy(y_train, y_hat)}')
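
Here the network outputs continuous values and to_label converts them into hard 0/1 predictions before show_result plots them. A minimal sketch, under the assumption of a 0.5 threshold on sigmoid outputs:

import numpy as np

# Hypothetical sketch: binarize network outputs at 0.5. The threshold and
# the array-based signature are assumptions, not the original utils code.
def to_label(y_hat, threshold=0.5):
    return (np.asarray(y_hat) > threshold).astype(int)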
Example #4
    def run_with_boosting_features(self):
        i = 0
        self.new_labels = []
        self.combined_labels = []

        for raw_segment, label_sequence in zip(self.raw_segments,
                                               self.label_sequences):
            feature_vectors, new_labels = self.hmm_new.decode(
                raw_segment, True, True, self.token_BGM, self.pattern_BGM)
            self.new_labels.append(new_labels)
            tokens = Tokens(raw_segment).tokens
            print i, ':  ', raw_segment

            # Combination step:
            tmp_combined_labels = []  # the decided combined labels so far
            for token, old_label, new_label, feature_vector in zip(
                    tokens, label_sequence, new_labels, feature_vectors):

                # Combine the old and new labels into a single combined label
                combined_label = -1

                if old_label == new_label:
                    combined_label = new_label
                    tmp_combined_labels.append(new_label)

                # Combine compatible labels: FN and LN
                elif old_label in [0, 1] and new_label in [0, 1]:
                    combined_label = old_label
                    tmp_combined_labels.append(new_label)

                # Combine labels that are not compatible
                else:
                    tmp_feature_entity = self.hmm_new.feature_entity_list.lookup(
                        feature_vector
                    )  # Background knowledge for this feature vector: the language feature model
                    sorted_label_distribution = sorted(
                        tmp_feature_entity.label_distribution.iteritems(),
                        key=operator.itemgetter(1),
                        reverse=True)
                    total_label_occurence = float(
                        sum(tmp[1] for tmp in sorted_label_distribution))

                    # ============================================================================================
                    # ============================================================================================
                    # ???? Experimenting: removing the low prob label distribution; FAILURE; ARCHIVED HERE AND DEPRECATED
                    # sorted_label_distribution = []
                    # sum_prob = 0.0
                    # for pair in tmp_sorted_label_distribution:
                    #     sorted_label_distribution.append(pair)
                    #     sum_prob += pair[1]
                    #     if sum_prob/total_label_occurence >= 0.90:
                    #         break
                    # ============================================================================================
                    # ============================================================================================

                    # Dominant label case: Iterate from the highest label stats according to this feature vector:
                    for label_frequency in sorted_label_distribution:
                        if int(label_frequency[0]) in [
                                old_label, new_label
                        ] and (label_frequency[1] /
                               total_label_occurence) >= self.DOMINANT_RATIO:
                            print 'Dominant labels'
                            # Check for constraint:
                            tmp_label_to_check = int(label_frequency[0])

                            # Find the last occurrence position of this label
                            if tmp_label_to_check not in [0, 1]:
                                last_occurence = ''.join([
                                    str(c) for c in tmp_combined_labels
                                ]).rfind(str(tmp_label_to_check))
                            elif tmp_label_to_check in [0, 1]:
                                last_occurence_0 = ''.join([
                                    str(c) for c in tmp_combined_labels
                                ]).rfind('0')
                                last_occurence_1 = ''.join([
                                    str(c) for c in tmp_combined_labels
                                ]).rfind('1')
                                last_occurence = max(last_occurence_0,
                                                     last_occurence_1)

                            # Checking constraints by simplifying what we did in viterbi
                            if last_occurence == -1 or last_occurence == (
                                    len(tmp_combined_labels) - 1
                            ):  # Never occurred, or the last occurrence is the final label so far
                                # When we are deciding the first label
                                if len(tmp_combined_labels) == 0:
                                    first_bit = self.find_majority_structure(
                                    )[0]
                                    if first_bit == 0 and tmp_label_to_check not in [
                                            0, 1
                                    ]:
                                        continue
                                    if first_bit == 3 and tmp_label_to_check != 3:
                                        continue

                                # VN CANNOT FOLLOW TI W/O DL constraint
                                if tmp_label_to_check == 4 and tmp_combined_labels[
                                        -1] == 3:
                                    continue
                            elif tmp_label_to_check in [0, 1]:
                                flag = False
                                for j in range(last_occurence,
                                               len(tmp_combined_labels)):
                                    if tmp_combined_labels[j] not in [0, 1, 2]:
                                        flag = True
                                        break
                                if flag:
                                    continue
                            elif tmp_label_to_check == 3:
                                continue
                            elif tmp_label_to_check == 4:
                                if tmp_combined_labels[-1] == 3:  #????
                                    continue

                            combined_label = tmp_label_to_check
                            tmp_combined_labels.append(tmp_label_to_check)
                            break

                    # No dominant label, or dominance failed a constraint: check whether a higher-probability label satisfies the publication-order constraint
                    if combined_label == -1:
                        # Iterate from the highest label stats according to this feature vector:

                        for label_frequency in sorted_label_distribution:
                            breakout_flag = False
                            # Test against constraints:
                            # 1. DL separate labels principle
                            # 2. AU-TI-VN Order
                            if int(label_frequency[0]) in [
                                    old_label, new_label
                            ]:
                                tmp_label_to_check = int(label_frequency[0])

                                # Determine which label groups have appeared so far, to predict what may appear next
                                structure_overview = [
                                ]  # records the coarse label order, e.g. [0, 3, 4] or [3, 0, 4]
                                for tmp_combined_label in tmp_combined_labels:
                                    if tmp_combined_label in [2, 5]:
                                        continue
                                    elif tmp_combined_label in [0, 1]:
                                        if 0 in structure_overview:
                                            continue
                                        else:
                                            structure_overview.append(0)
                                    elif tmp_combined_label == 3:
                                        if 3 in structure_overview:
                                            continue
                                        else:
                                            structure_overview.append(3)
                                    elif tmp_combined_label == 4:
                                        if 4 in structure_overview:
                                            continue
                                        else:
                                            structure_overview.append(4)
                                # Based on the structure overview, find what should appear next
                                appear_next = []
                                if structure_overview == [0]:
                                    appear_next = [0, 1, 3, 2, 5]
                                elif structure_overview == [3]:
                                    appear_next = [3, 0, 1, 2, 5]
                                elif structure_overview == [0, 3]:
                                    appear_next = [3, 4, 2, 5]
                                elif structure_overview == [3, 0]:
                                    appear_next = [0, 1, 4, 2, 5]
                                elif structure_overview == [0, 3, 4]:
                                    appear_next = [4, 2, 5]
                                elif structure_overview == [3, 0, 4]:
                                    appear_next = [4, 2, 5]
                                else:  #weird case
                                    print 'Weird structure! Weird case!'
                                    if tmp_feature_entity.label_distribution[str(
                                            old_label
                                    )] > tmp_feature_entity.label_distribution[
                                            str(new_label)]:
                                        tmp_label_to_check_list = [
                                            old_label, new_label
                                        ]
                                    else:
                                        tmp_label_to_check_list = [
                                            new_label, old_label
                                        ]
                                    # Apply constraints here too
                                    for tmp_label_to_check in tmp_label_to_check_list:
                                        if tmp_label_to_check not in [0, 1]:
                                            last_occurence = ''.join([
                                                str(c)
                                                for c in tmp_combined_labels
                                            ]).rfind(str(tmp_label_to_check))
                                        elif tmp_label_to_check in [0, 1]:
                                            last_occurence_0 = ''.join([
                                                str(c)
                                                for c in tmp_combined_labels
                                            ]).rfind('0')
                                            last_occurence_1 = ''.join([
                                                str(c)
                                                for c in tmp_combined_labels
                                            ]).rfind('1')
                                            last_occurence = max(
                                                last_occurence_0,
                                                last_occurence_1)

                                        # Checking constraints by simplifying what we did in viterbi
                                        if last_occurence == -1 or last_occurence == (
                                                len(tmp_combined_labels) - 1):
                                            # When we are deciding the first label
                                            if len(tmp_combined_labels) == 0:
                                                first_bit = self.find_majority_structure(
                                                )[0]
                                                if first_bit == 0 and tmp_label_to_check not in [
                                                        0, 1
                                                ]:
                                                    continue
                                                if first_bit == 3 and tmp_label_to_check != 3:
                                                    continue
                                            try:
                                                # tmp_combined_labels may be empty here;
                                                # treat that like a failed check
                                                if tmp_label_to_check == 4 and tmp_combined_labels[
                                                        -1] == 3:
                                                    continue
                                            except IndexError:
                                                continue
                                        elif tmp_label_to_check in [0, 1]:
                                            flag = False
                                            for j in range(
                                                    last_occurence,
                                                    len(tmp_combined_labels)):
                                                if tmp_combined_labels[
                                                        j] not in [0, 1, 2]:
                                                    flag = True
                                                    break
                                            if flag:
                                                continue
                                        elif tmp_label_to_check == 3:
                                            continue
                                        elif tmp_label_to_check == 4:
                                            if tmp_combined_labels[-1] == 3:
                                                continue

                                        combined_label = tmp_label_to_check
                                        tmp_combined_labels.append(
                                            combined_label)
                                        breakout_flag = True
                                        break

                                if breakout_flag:
                                    break
                                if tmp_label_to_check in appear_next:
                                    # Check the DL constraints via the last occurrence; token-level
                                    # verification is assumed done in the first iteration
                                    if tmp_label_to_check not in [0, 1]:
                                        last_occurence = ''.join([
                                            str(c) for c in tmp_combined_labels
                                        ]).rfind(str(tmp_label_to_check))
                                    elif tmp_label_to_check in [0, 1]:
                                        last_occurence_0 = ''.join([
                                            str(c) for c in tmp_combined_labels
                                        ]).rfind('0')
                                        last_occurence_1 = ''.join([
                                            str(c) for c in tmp_combined_labels
                                        ]).rfind('1')
                                        last_occurence = max(
                                            last_occurence_0, last_occurence_1)

                                    # Checking constraints by simplifying what we did in viterbi
                                    if last_occurence == -1 or last_occurence == (
                                            len(tmp_combined_labels) - 1):
                                        if tmp_label_to_check == 4 and tmp_combined_labels[
                                                -1] == 3:  # Hardcoded rule [2013/07/23]: VN cannot directly follow TI without a DL ???? may remove after checking the real effect
                                            continue
                                    elif tmp_label_to_check in [0, 1]:
                                        flag = False
                                        for j in range(
                                                last_occurence,
                                                len(tmp_combined_labels)):
                                            if tmp_combined_labels[j] not in [
                                                    0, 1, 2
                                            ]:
                                                flag = True
                                                break
                                        if flag:
                                            continue

                                    elif tmp_label_to_check == 3:
                                        continue
                                        # flag = False
                                        # for j in range(last_occurence, len(tmp_combined_labels)):
                                        #     if tmp_combined_labels[j] not in [3,2]:
                                        #         flag = True
                                        #         break
                                        # if flag:
                                        #     continue

                                    elif tmp_label_to_check == 4:
                                        if tmp_combined_labels[-1] == 3:  #????
                                            continue

                                    # elif tmp_label_to_check == 2:
                                    # elif tmp_label_to_check == 5:

                                    # Otherwise, pass
                                    log_err('\t\t' + str(i) +
                                            ': should combine this one')
                                    combined_label = tmp_label_to_check
                                    tmp_combined_labels.append(
                                        tmp_label_to_check)
                                    # combined_label = (tmp_label_to_check, sorted_label_distribution)
                                    break

                                else:
                                    continue

                        # Debug
                        if combined_label == -1:
                            log_err(str(i) + ': problem')
                            combined_label = (appear_next,
                                              sorted_label_distribution)
                            tmp_combined_labels.append(-1)

            # Final check against the majority order; ideally all records in one domain share the same order (admittedly ugly code)
            print '==========================tmp_combined_labels', tmp_combined_labels
            majority_structure = self.find_majority_structure()
            majority_order_structure = majority_structure[1]
            majority_rate = majority_structure[2]
            tmp_combined_labels_length = len(tmp_combined_labels)
            if majority_rate > 0.80 and majority_order_structure == [0, 3, 4]:
                # p1(phase1): author segments
                for p1 in range(tmp_combined_labels_length):
                    if tmp_combined_labels[p1] in [0, 1, 2, 5]:
                        continue
                    else:
                        break

                # p2(phase2): title segments
                for p2 in range(p1, tmp_combined_labels_length):
                    if tmp_combined_labels[p2] == 3:
                        continue
                    else:
                        break

                #p3(phase3): venue segments
                for p3 in range(p2, tmp_combined_labels_length):
                    if tmp_combined_labels[p3] in [2, 5, 4]:
                        continue
                    else:
                        break

                # Decision
                if p1 == 0:
                    print 'Houston we got a SERIOUS problem!'
                    log_err('Houston we got a SERIOUS problem!!!!!!!!')

                if p2 == p1:
                    print 'Houston we got a problem!'
                    for sp2 in range(p2, tmp_combined_labels_length):
                        if tmp_combined_labels[sp2] != 2:
                            tmp_combined_labels[sp2] = 3
                        else:
                            break  # should fix common mislabeling at this point now??????????

            # elif majority_rate > 0.80 and majority_order_structure == [3,0,4]:    # ???? not sure if this is normal
            #     # p1(phase1): title segments
            #     for p1 in range(tmp_combined_labels_length):
            #         if tmp_combined_labels[p1] in [3]:
            #             continue
            #         else:
            #             break

            #     # p2(phase2): author segments
            #     for p2 in range(p1, tmp_combined_labels_length):
            #         if tmp_combined_labels[p2] == 3:
            #             continue
            #         else:
            #             break

            #     #p3(phase3): venue segments
            #     for p3 in range(p2, tmp_combined_labels_length):
            #         if tmp_combined_labels[p3] in [2,5,4]:
            #             continue
            #         else:
            #             break

            #     # Decision
            #     if p1 == 0:
            #         print 'Houston we got a SERIOUS problem!'
            #         log_err('Houston we got a SERIOUS problem!!!!!!!!')

            #     if p2 == p1:
            #         print 'Houston we got a problem!'
            #         for sp2 in range(p2, tmp_combined_labels_length):
            #             if tmp_combined_labels[sp2] != 2:
            #                 tmp_combined_labels[sp2] = 3
            #             else:
            #                 break
            for old_label, new_label, tmp_combined_label, token, feature_vector in zip(
                    label_sequence, new_labels, tmp_combined_labels, tokens,
                    feature_vectors):
                print to_label(old_label), '\t', to_label(
                    new_label), '\t', to_label(
                        tmp_combined_label), '\t', token, '\t', feature_vector
            print '\n'
            i += 1
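
The constraint checks above repeatedly join tmp_combined_labels into a string and call rfind to locate a label's most recent position. A small helper (hypothetical, not in the original code) expresses the same idea directly on the list and avoids the string round-trip:

# Returns the last index of target in labels, or -1 if it never occurs;
# equivalent to ''.join(str(c) for c in labels).rfind(str(target)) for the
# single-digit labels used here.
def last_occurrence(labels, target):
    for idx in range(len(labels) - 1, -1, -1):
        if labels[idx] == target:
            return idx
    return -1

# FN and LN (labels 0 and 1) are treated as one group:
# max(last_occurrence(labels, 0), last_occurrence(labels, 1))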
Example #5
    def create_dataset(self, dataset_type="id_index", standardize_features=True,
                       on_gpu=False, oversampling_ratio=1):
        """
        Args:
            dataset_type: str. Has to be "train_val", "id_index" or "raw"
                ("raw" is accepted here, but its branch below is currently
                commented out)
        Returns:
            If dataset_type is "train_val": dict with keys "train", "val", "test".
            If dataset_type is "id_index": dict keyed by news_id.
        """
        if dataset_type not in ["train_val", "id_index", "raw"]:
            raise ValueError("Supported dataset types are: 'train_val', 'id_index', 'raw'.")

        start_time = time.time()

        trees_to_parse = get_tree_file_names(self.dataset_dir)

        labels = self.load_labels()

        # Create train-val-test split
        # Remove useless trees (i.e. with labels that we don't consider)

        news_ids_to_consider = list(labels.keys())
        if self.only_binary:
            news_ids_to_consider = [news_id for news_id in news_ids_to_consider
                                    if labels[news_id] in ['false', 'true']]

        train_ids, val_ids = train_test_split(news_ids_to_consider, test_size=0.01, random_state=self.seed)
        train_ids, test_ids = train_test_split(train_ids, test_size=0.3, random_state=self.seed*7)
        print(f"Len train/val/test {len(train_ids)} {len(val_ids)} {len(test_ids)}")

        user_ids_in_train, tweet_ids_in_train = \
            self.get_user_and_tweet_ids_in_train(trees_to_parse, train_ids)



        # tweet_features = self.load_tweet_features_bert()
        # print("tweet_features_size:{}".format(len(tweet_features)))
        tweet_features = self.load_tweet_features_one_hot()
        user_features = self.load_user_features()

        print("User features:")
        for key in user_features:
            for k in user_features[key]:
                print('\t'+k)
            break

        # preprocessed_tweet_fts = self.preprocess_tweet_features(tweet_features, tweet_ids_in_train)
        
        # preprocessed_user_fts = self.preprocess_user_features(user_features, user_ids_in_train, \
        #     standardize_features)
        
        # basic_tests.test_user_preprocessed_features(preprocessed_user_fts)

        ids_to_dataset = {news_id: 'train' for news_id in train_ids}
        ids_to_dataset.update({news_id: 'val' for news_id in val_ids})
        ids_to_dataset.update({news_id: 'test' for news_id in test_ids})
        
        print("Parsing trees...")
        
        trees = []
        for tree_file_name in trees_to_parse:
            news_id = get_root_id(tree_file_name)
            label = labels[news_id]
            if (not self.only_binary) or (label in ['false', 'true']):
                
                retweeters, retweet_lens, time_outs = self.get_retweet_list(tree_file_name, user_features)
                # node_features, edges = self.build_tree(tree_file_name, tweet_fts=preprocessed_tweet_fts,
                #                                        user_fts=preprocessed_user_fts)
                
                adj, retweeter_fts = self.get_retweeter_adj(news_id, retweeters, \
                    retweet_lens, time_outs, user_features)
                trees.append((news_id, label, retweeter_fts, tweet_features[news_id], adj))
                
        print("trees num: {}".format(len(trees)))
        print("Generating dataset...")

        if dataset_type == "train_val":
            dataset = {'train': {'data_all':[], 'padded_docs':[], 'cos':[], 'label':[]}, 
                'val': {'data_all':[], 'padded_docs':[], 'cos':[], 'label':[]}, 
                'test': {'data_all':[], 'padded_docs':[], 'cos':[], 'label':[]}}
        elif dataset_type == "id_index":
            dataset = {}
            for news_id, label, retweeter_fts, tweet_feature, adj in trees:
                # dataset[news_id] = {'data_all':[], 'padded_docs':[], 'cos':[], 'label':[]}
                dataset[news_id] = {}
                
        for news_id, label, retweeter_fts, tweet_feature, adj in trees:
            x = tweet_feature
            y = np.array(to_label(label))
            retweeter_fts = retweeter_fts.astype('float')
            if dataset_type == "train_val":
                dataset[ids_to_dataset[news_id]]['data_all'].append(retweeter_fts)
                dataset[ids_to_dataset[news_id]]['padded_docs'].append(x)
                dataset[ids_to_dataset[news_id]]['cos'].append(adj)
                dataset[ids_to_dataset[news_id]]['label'].append(y)
            elif dataset_type == "id_index":
                dataset[news_id]['data_all'] = retweeter_fts
                dataset[news_id]['padded_docs'] = x
                dataset[news_id]['cos'] = adj
                dataset[news_id]['label'] = y
                # dataset[news_id]['data_all'].append(retweeter_fts)
                # dataset[news_id]['padded_docs'].append(x)
                # dataset[news_id]['cos'].append(adj)
                # dataset[news_id]['label'].append(y)
            # elif dataset_type == "sequential":
            #     y = to_label(label)
            #     sequential_data = np.array(node_features)  
            #     # If we go for this one, returns the features of the successive new tweet-user tuples 
            #     # encountered over time
            #     dataset[ids_to_dataset[news_id]].append([sequential_data, y])
            #     # print(sequential_data.mean(dim=0))
            #     # print("label was {}".format(label))
            # elif dataset_type == "raw":
            #     dataset[ids_to_dataset[news_id]].append(
            #         [[label, news_id] + edge + list(node_features[edge[1]]) for edge in
            #          edges])  # edge = [node_index_in, node_index_out, time_out, uid_in, uid_out]
        
        if dataset_type == 'train_val':
            for key in dataset:
                # print(type(dataset[key]['data_all']), type(dataset[key]['data_all'][0]))
                dataset[key]['data_all'] = np.array(dataset[key]['data_all'])
                # print(dataset[key]['data_all'].shape)
                # dataset[key]['data_all'] = torch.from_numpy(dataset[key]['data_all'])
                dataset[key]['padded_docs'] = np.array(dataset[key]['padded_docs'])
                dataset[key]['cos'] = np.array(dataset[key]['cos'])
                dataset[key]['label'] = np.array(dataset[key]['label'])
        print(f"Dataset loaded in {time.time() - start_time:.3f}s")
        return dataset
Example #6
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov  6 20:53:36 2018

@author: Arpit
"""

from utils import save_preds, get_songs, split_data, to_label
from data_processing import get_image_data
from model import Model

# get names of all the songs
songs = get_songs()

# split them into training and validation sets according to the given percentage
songs_train, songs_valid = split_data(songs, 0.85)

# get actual spectrogram (2D np.array) data for the songs
X_train, Y_train = get_image_data('train', songs_train)
X_valid, Y_valid = get_image_data('valid', songs_valid)

# get names and spectrogram data for the final test set (which is to be uploaded)
X_test, keys = get_image_data('test')

model = Model(False)
model.train(X_train, Y_train, X_valid, Y_valid, 5000)

preds = model.predict(X_test)
preds = [to_label(pred) for pred in preds]
save_preds(keys, preds, 'predictions.csv')
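
This example and Example #7 classify audio clips (the '.au' suffix suggests a music-genre task), so to_label here plausibly maps a predicted class index or score vector back to a genre name. A sketch under that assumption; the genre list is illustrative, not confirmed by the source:

import numpy as np

# Hypothetical mapping; the real list lives in this project's utils.
GENRES = ['blues', 'classical', 'country', 'disco', 'hiphop',
          'jazz', 'metal', 'pop', 'reggae', 'rock']

def to_label(pred):
    # accept either a bare class index or a per-class score vector
    idx = int(np.argmax(pred)) if np.ndim(pred) > 0 else int(pred)
    return GENRES[idx]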
Example #7
# NOTE: get_song_data, get_model, get_songs, save_preds and to_label are
# assumed to come from this project's helper modules.
import numpy as np
from sklearn.preprocessing import Normalizer

# fit an L1 normalizer on the training data and transform it
norm = Normalizer(norm='l1').fit(X)
X = norm.transform(X)

# get Fourier-transformed data for the validation set and normalize it with the training-set normalizer
X_valid, Y_valid = get_song_data(validation_songs)
X_valid = norm.transform(X_valid)

# build the convolutional network and fit the data
model = get_model(input_size)
model.fit(X,
          Y,
          n_epoch=20,
          validation_set=(X_valid, Y_valid),
          show_metric=True)

# finally, predict with per-segment voting
test_songs = get_songs('test')
keys = []
preds = []
for song in test_songs:
    X, _ = get_song_data([song])
    X = norm.transform(X)
    Y_ = model.predict(X)
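    # sum per-class scores over the song's segments, then vote by argmax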
    label = to_label(np.argmax(np.sum(Y_, axis=0)))
    preds.append(label)
    keys.append('.'.join(song.split('.')[:2]) + '.au')

save_preds(keys, preds, 'predictions_f.csv')