示例#1
0
class CarbonaraBros():
    """Classify <table> and list elements of a DOM as relevant or not.

    Each element is scored by a pre-trained binary classifier; elements whose
    positive-class probability reaches ``relevant_threshold`` are reported as
    relevant.
    """

    def __init__(self, relevant_threshold=0.8):
        self.fe = FeaturesExtractor()
        self.relevant_threshold = relevant_threshold
        self.tableClassifier = Classifier('models/table_classifier.h5')
        self.listClassifier = Classifier('models/list_classifier.h5')

    def processDom(self, dom):
        """Bucket every table/list node of `dom` by classifier score.

        Returns a dict of the shape
        ``{'table': {'relevant': [...], 'not_relevant': [...]}, 'list': {...}}``
        where each bucket holds ``(score, node)`` tuples.
        """
        analysis = {
            'table': {
                'relevant': [],
                'not_relevant': [],
            },
            'list': {
                'relevant': [],
                'not_relevant': []
            }
        }

        # table
        for table in dom.xpath("//table"):
            features = self.fe.extract(
                table,
                selected=DefaultFeatures.table_selected,
                features_descriptor=DefaultFeatures.table)
            features_array = self.fe.toArray(features)
            probabilities = self.tableClassifier.classify(features_array)

            # probabilities[1] is the positive ("relevant") class probability.
            score = probabilities[1]
            if score >= self.relevant_threshold:
                analysis['table']['relevant'].append((score, table))
            else:
                analysis['table']['not_relevant'].append((score, table))

        # lists: <ul>, <ol> and <dl> elements, in that query order
        lists = dom.xpath("//ul") + dom.xpath("//ol") + dom.xpath("//dl")

        # fix: loop variable used to shadow the builtin `list`
        for list_node in lists:
            features = self.fe.extract(
                list_node,
                selected=DefaultFeatures.list_selected,
                features_descriptor=DefaultFeatures.list)
            features_array = self.fe.toArray(features)
            probabilities = self.listClassifier.classify(features_array)
            score = probabilities[1]

            if score >= self.relevant_threshold:
                analysis['list']['relevant'].append((score, list_node))
            else:
                analysis['list']['not_relevant'].append((score, list_node))

        return analysis
示例#2
0
def initialize():
    """Walk two directory levels under `directory`, read each "cache-file",
    and build the training arrays.

    Each cache-file line is ``<audio path> <word> <start> <end>``.
    Returns ``(all_output_labels, all_features, all_words)`` once `data_size`
    samples have been collected; returns None implicitly if the dataset is
    exhausted first (original behavior, preserved).
    """
    index = 0

    all_features = np.empty((data_size, feature_length))
    all_output_labels = np.empty((data_size, size_of_output_labels_vector))
    # index -> word, for decoding labels back to words later
    all_words = {}

    for filename in os.listdir(directory):
        level1 = os.path.join(directory, filename)
        if not os.path.isdir(level1):
            continue

        for filename2 in os.listdir(level1):
            level2 = os.path.join(level1, filename2)
            if not os.path.isdir(level2):
                continue

            for filename3 in os.listdir(level2):
                if filename3 != "cache-file":
                    continue

                # fix: 'rd' is not a valid open() mode and raises ValueError;
                # the cache file is parsed line-by-line as text -> 'r'.
                with open(os.path.join(level2, filename3), 'r') as f:
                    for line in f:
                        parts = line.split(' ')
                        audioFilePath = os.path.join(level2, parts[0])
                        word = parts[1]
                        startTime = float(parts[2])
                        endTime = float(parts[3])

                        # fix: audio is binary data -> 'rb' (was invalid 'rd').
                        # NOTE(review): the handle `a` is never read here;
                        # the open may only be a cheap existence check.
                        with open(audioFilePath, 'rb') as a:
                            timelinedWord = WordWithTimeline(
                                word, startTime, endTime)

                            try:
                                all_features[index, :] = \
                                    FeaturesExtractor.getFeaturesFFT(
                                        timelinedWord, audioFilePath,
                                        feature_length)
                                all_output_labels[index, :] = np.zeros(
                                    size_of_output_labels_vector)

                                all_words[index] = word

                                # one-hot mark of the word's class index
                                all_output_labels[
                                    index, word_to_index[word]] = 1
                                index += 1

                            except ValueError:
                                # FFT produced a wrong-sized/degenerate vector
                                print("skipping word, all zeros")

                            if index >= data_size:
                                return all_output_labels, all_features, all_words
示例#3
0
def print_result(result, color):
    """Print one line per (score, node) pair: score, text summary, features.

    result: iterable of ``(score, node)`` tuples (nodes expose ``.tag``).
    color: colour passed through to ``with_color`` for the score column.
    """
    summary_length = 60
    # fix: a fresh FeaturesExtractor was constructed on EVERY iteration;
    # create it lazily, once (empty input still touches no models).
    ft = None

    for score, node in result:
        # 1st column: score, rounded for display
        score = round(score, 2)

        # 2nd column: quoted text summary of the node
        node_summary = node_text_summary(node, length=summary_length)
        node_summary = '"{}"'.format(node_summary)

        # 3rd column: feature vector (tables and lists use different descriptors)
        is_table = node.tag == "table"
        descriptor = DefaultFeatures.table if is_table else DefaultFeatures.list
        selected = DefaultFeatures.table_selected if is_table else DefaultFeatures.list_selected
        if ft is None:
            ft = FeaturesExtractor()
        features = ft.extract(node,
                              selected=selected,
                              features_descriptor=descriptor)
        features_array = ft.toArray(features)

        # pad the summary column to a fixed width
        padding = " " * (summary_length - len(node_summary))

        print(with_color(score, color=color), node_summary, padding,
              str(list(features_array)))
示例#4
0
    # NOTE(review): fragment — the enclosing function's signature and the
    # origin of `candidates`, `tomogram`, `features_extractor`, `labeler`
    # are outside this view.
    for candidate in candidates:
        # one feature vector per candidate, extracted against the tomogram
        feature_vectors.append(
            features_extractor.extract_features(tomogram, candidate))
        # this sets each candidate's label
        labels.append(labeler.label(candidate))

    # parallel sequences: candidates[i] <-> feature_vectors[i] <-> labels[i]
    return (candidates, feature_vectors, labels)


#this is tuple of tuples of TiltedTemplates (each group has the same template_id)
templates = TemplateGenerator.generate_tilted_templates()
#save templates to files

# The selector proposes candidate positions; the extractor turns each
# candidate into a feature vector against the same template set.
candidate_selector = CandidateSelector.CandidateSelector(templates)
features_extractor = FeaturesExtractor.FeaturesExtractor(templates)

#Training

feature_vectors = []
#a label is a template_id, where 0 is junk
labels = []

# Fixed composition used for every generated training tomogram.
# NOTE(review): argument meaning of Candidate.fromTuple is not visible here —
# presumably (template_id, tilt, x, y); confirm against its definition.
criteria = (Candidate.fromTuple(1, 0, 10,
                                10), Candidate.fromTuple(1, 2, 27, 18),
            Candidate.fromTuple(0, 0, 10, 28))

# NOTE(review): the loop body is cut off at the end of this view.
for i in range(TRAINING_SET_SIZE):
    # configuration for tomogram generation
    #with a set composition
    tomogram = TomogramGenerator.generate_tomogram_with_given_candidates(
示例#5
0
            # NOTE(review): fragment — the enclosing function and the `if`
            # guarding this makedirs start outside this view.
            os.makedirs(videoFeatureDir)
        for featureName in featureNameListNew:
            videoFeatureList = []
            # concatenate the three per-part feature files for this video
            for i in range(0, 3):
                videoFeatureList.extend(
                    np.loadtxt(featureName + "_feature_" + str(i)))
            # write the combined vector on a single space-separated line
            np.savetxt(videoFeatureDir + os.sep +
                       os.path.basename(featureName),
                       videoFeatureList,
                       newline=" ")


if __name__ == '__main__':
    # Time the full feature-extraction run over the keyframe directory.
    starttime = datetime.datetime.now()
    extractor = fe.FeaturesExtractor(
        r"/home/sunbite/MFSSEL/keyframe_not_on_spark/",
        r"/home/sunbite/MFSSEL/features_not_on_spark/")
    extractor.featuresExtractor()
    endtime = datetime.datetime.now()

    # Report elapsed wall-clock time between two separator rules.
    rule = '----------------------------------------------------------------------------'
    print(rule)
    print(rule)
    elapsed = (endtime - starttime).seconds
    print('-------------FeaturesExtractor Running time: %s Seconds--------------'
          % elapsed)
示例#6
0
# set random seed
seed(random_seed)
set_random_seed(random_seed)

### data load ###
logger.info('load data')
(x_train, y_train), (x_test, y_test) = load_data(dataset)

# create the training and test data with normal class only for training CAE's
x_train_normal, x_test_normal = get_normal_data(x_train, y_train, x_test,
                                                y_test, normal_class)

### make features_extractor model and extract features for training and test dataset images
logger.info('extract features')
if features_extractor == 'cae':
    # convolutional-autoencoder features; also returns extraction wall time
    features_train, features_test, featuresExtractTime = FeaturesExtractor.cae(
        cae_type, x_train_normal, x_test_normal, x_test)
else:
    # raw pixels: no extraction step, so the timing is zero
    featuresExtractTime = 0
    features_train, features_test = FeaturesExtractor.raw(
        x_train_normal, x_test)

### anomaly detection
logger.info('anomaly detection: calculate anomaly scores and auc')
if anomaly_detection == 'ocsvm':
    scores, anomalyDetectTime = AnomalyDetection.ocsvm(features_train,
                                                       features_test)
    # NOTE(review): here labels_test is True for the NORMAL class...
    labels_test = y_test.flatten() == normal_class
elif anomaly_detection == 'nnd':
    scores, anomalyDetectTime = AnomalyDetection.nnd(features_train,
                                                     features_test, k)
    # NOTE(review): ...but True for the ANOMALOUS class in this branch.
    # If both detectors' scores have the same polarity this inverts the AUC
    # for one of them — confirm which convention each scorer uses.
    labels_test = y_test.flatten() != normal_class
# NOTE(review): no else branch — an unrecognized `anomaly_detection` value
# leaves `scores`/`labels_test`/`anomalyDetectTime` undefined downstream.
示例#7
0
                                # NOTE(review): fragment — the enclosing loops
                                # and the `if` matching this `else` are outside
                                # this view.
                                else:
                                    if word not in word_to_index:
                                        word_to_index[word] = max(
                                            word_to_index.values()) + 1
                                        # NOTE(review): likely off-by-one — max()
                                        # is re-evaluated AFTER inserting the new
                                        # word, so this key is the word's index
                                        # plus one, not the index itself. Confirm
                                        # and fix at the source.
                                        index_to_word[
                                            max(word_to_index.values()) +
                                            1] = word

                                startTime = float(line.split(' ')[2])
                                endTime = float(line.split(' ')[3])

                                # NOTE(review): 'rd' is not a valid open() mode
                                # and raises ValueError at runtime — should be
                                # 'rb' (binary audio) or 'r'.
                                with open(audioFilePath, 'rd') as a:
                                    timelinedWord = WordWithTimeline(
                                        word, startTime, endTime)

                                    features = FeaturesExtractor.getFeatures(
                                        timelinedWord, audioFilePath)
                                    #print(audioFilePath)
                                    print(word)
                                    #print("Start time " + str(startTime))
                                    #print("End time " + str(endTime))
                                    #print("Feature size: " + str(features.size))

                                    # register newly-seen words in the vocabulary
                                    # and persist them to the word-index file
                                    if word not in wordToIndex:
                                        wordToIndex[word] = wordIndex
                                        indexToWord[wordIndex] = word
                                        wordIndex += 1
                                        wordToIndexFile.write(word + "\n")

            #!!!!!!!!!!!!!!!!!!!!!!! Do SOMETING WITH FEATURES HERE !!!!!!!!!!!!!!!!!!!!!!!
            # They have a variable size so we probably need to do something about that
示例#8
0
 def __init__(self, relevant_threshold=0.8):
     """Set up feature extraction and the two pre-trained classifiers.

     relevant_threshold: minimum positive-class probability for an element
     to be reported as relevant (default 0.8).
     """
     self.relevant_threshold = relevant_threshold
     self.fe = FeaturesExtractor()
     # Keras model files for the two element kinds.
     self.tableClassifier = Classifier('models/table_classifier.h5')
     self.listClassifier = Classifier('models/list_classifier.h5')
示例#9
0
# Read `numOfWords` labelled samples from the dataset cache file and fill the
# pre-allocated `words` / `features` arrays.
# fix: 'rd' is not a valid open() mode (raises ValueError); the file is parsed
# line-by-line as text -> 'r'.
with open(os.path.join(directory, "cache-file"), 'r') as f:

    while numOfWords > 0:
        line = f.readline()
        # line format: <audio path> <word> <start> <end>
        parts = line.split(' ')

        # fill positions 0..4 front-to-back as numOfWords counts 5..1 down
        words[5 - numOfWords] = parts[1]

        audioFilePath = os.path.join(directory, parts[0])
        startTime = float(parts[2])
        endTime = float(parts[3])

        # with open(audioFilePath, 'rd') as a:
        timelinedWord = WordWithTimeline(parts[1], startTime, endTime)
        features[5 - numOfWords, :] = FeaturesExtractor.getFeaturesFFT(
            timelinedWord, audioFilePath, feature_length)

        numOfWords -= 1

# one label vector per collected word
for i in range(5):
    outputs[i] = generateDicArray(words, words[i])

# print(words)
# print(outputs)
# print(features)

n_classes = 5

# feature vectors are treated as square (x, x, 1) images downstream
x = int(np.sqrt(feature_length))

# NOTE(review): the lines below are corrupted — an unfinished `Convolutional(`
# call is spliced mid-argument into an unrelated feature-saving loop (it
# duplicates the videoFeatureList/np.savetxt code seen elsewhere in this file).
# This cannot parse as Python; recover both original sources before editing.
layer_1 = Convolutional(input_shape=(x, x, 1),
                videoFeatureList.extend(
                    np.loadtxt(featureName + "_feature_" + str(i)))
            np.savetxt(videoFeatureDir + os.sep +
                       os.path.basename(featureName),
                       videoFeatureList,
                       newline=" ")


# NOTE(review): this block is cut off mid-statement at the end of the view
# (the final `print(` is incomplete), so it is documented in place only.
if __name__ == '__main__':
    # time the whole per-video feature aggregation run
    starttime = datetime.datetime.now()
    # fe.FeaturesExtractor(
    #     r"/home/sunbite/keyframe/",
    #     r"/home/sunbite/features_new_1").featuresExtractor()

    # aggregate per-keyframe features into one vector per video
    fe.FeaturesExtractor(
        r"/home/sunbite/MFSSEL/keyframe/",
        r"/home/sunbite/MFSSEL/features_new_1/").getAllVideoFeature()
    endtime = datetime.datetime.now()
    print(
        '----------------------------------------------------------------------------'
    )
    print(
        '----------------------------------------------------------------------------'
    )
    print(
        '-------------FeaturesExtractor Running time: %s Seconds--------------'
        % (endtime - starttime).seconds)
    print(
        '----------------------------------------------------------------------------'
    )
    print(