Example #1
def predict(input_path, output_path, start_extract, end_extract):
    tmp_dir = 'tmp_files/'
    tmp_features = 'tmp.features'
    tmp_prob = 'tmp.prob'
    tmp_prediction = 'tmp.prediction'
    if not os.path.exists(input_path):
        print >> sys.stderr, "wav file does not exist"
        return

    length = utils.get_wav_file_length(input_path)
    feature_file = tmp_dir + tmp_features
    prob_file = tmp_dir + tmp_prob
    predict_file = tmp_dir + tmp_prediction

    # remove tmp dir if it exists
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.mkdir(tmp_dir)

    print '\n1) Extracting features and classifying ...'
    extract_features(input_path, feature_file, start_extract, end_extract)
    run(feature_file, prob_file)
    print '\n3) Extract Durations ...'
    post_process(prob_file, predict_file)
    print '\n4) Writing TextGrid file to %s ...' % output_path
    create_text_grid(predict_file, output_path, length, float(start_extract))
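Note: this example (like #4 and #29 below) is Python 2. A minimal Python 3 port of the error check, assuming the same os/sys imports, would be:

    if not os.path.exists(input_path):
        print("wav file does not exist", file=sys.stderr)
        return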
Example #2
def main(argv):
    del argv  # Unused.
    if FLAGS.evaluate:
        extract_features(FLAGS)
    else:
        wandb.init(config=FLAGS, sync_tensorboard=True)
        set_up_train(FLAGS)
Example #3
def main():
    """These are the main training settings. Set each before running
    this file."""

    if len(sys.argv) == 5:
        seq_length = int(sys.argv[1])
        class_limit = int(sys.argv[2])
        image_height = int(sys.argv[3])
        image_width = int(sys.argv[4])
    else:
        print("Usage: python train.py sequence_length class_limit image_height image_width")
        print("Example: python train.py 75 2 720 1280")
        exit(1)

    sequences_dir = os.path.join('data', 'sequences')
    if not os.path.exists(sequences_dir):
        os.mkdir(sequences_dir)

    checkpoints_dir = os.path.join('data', 'checkpoints')
    if not os.path.exists(checkpoints_dir):
        os.mkdir(checkpoints_dir)

    # model can only be 'lstm'
    model = 'lstm'
    saved_model = None  # None or weights file
    load_to_memory = False  # pre-load the sequences into memory
    batch_size = 10
    nb_epoch = 50
    data_type = 'features'
    image_shape = (image_height, image_width, 3)

    extract_features(seq_length=seq_length, class_limit=class_limit, image_shape=image_shape)
    train(data_type, seq_length, model, saved_model=saved_model,
          class_limit=class_limit, image_shape=image_shape,
          load_to_memory=load_to_memory, batch_size=batch_size, nb_epoch=nb_epoch)
Example #4
def predict(input_path, output_path, start_extract, end_extract):
    tmp_dir = 'tmp_files/'
    tmp_features = 'tmp.features'
    tmp_prob = 'tmp.prob'
    tmp_prediction = 'tmp.prediction'
    if not os.path.exists(input_path):
        print >> sys.stderr, "wav file does not exist"
        return

    length = utils.get_wav_file_length(input_path)
    feature_file = tmp_dir + tmp_features
    prob_file = tmp_dir + tmp_prob
    predict_file = tmp_dir + tmp_prediction

    # remove tmp dir if it exists
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.mkdir(tmp_dir)

    print '\n1) Extracting features and classifying ...'
    extract_features(input_path, feature_file, start_extract, end_extract)
    run(feature_file, prob_file)
    print '\n3) Extract Durations ...'
    post_process(prob_file, predict_file)
    print '\n4) Writing TextGrid file to %s ...' % output_path
    create_text_grid(predict_file, output_path, length, float(start_extract))
Example #5
def main(argv):
    del argv  # Unused.
    if FLAGS.evaluate:
        extract_features(FLAGS)
    else:
        wandb.init(config=FLAGS,
                   sync_tensorboard=True,
                   dir="/dfs/scratch2/prabhat8/cs236g/wandb")
        set_up_train(FLAGS)
Example #6
def main():
    """ Training and test settings for the algorithm."""
    model_name = 'lstm'
    saved_model = None
    cls_lmt = None
    batch_size = 32
    prompt = "Enter the sequence length"
    seq_length = raw_input(prompt)
    # np.int64('') raises ValueError, so validate the raw string first
    seq_length = np.int64(seq_length) if seq_length else None
    prompt = "Enter the maximum frame length"
    max_frames = raw_input(prompt)
    if max_frames == '':
        raise ValueError("Enter a valid integer for max frames")
    max_frames = np.int64(max_frames)
    prompt = "Enter the model_name from the list \n 1.LSTM \n 2.CNN_LSTM"
    model_name = raw_input(prompt)
    if model_name == 'LSTM':
        data_type = 'features'
        img_shape = None
        # Feature Extraction
        extract_features(seq_length, max_frames, abspath=None)
    elif model_name == 'CNN_LSTM':
        data_type = 'images'
        img_shape = (100, 100, 3)
    else:
        raise ValueError(
            "Invalid Model Selected. Select from List given in the options")

    # Training the algorithm
    train(seq_length,
          model_name,
          saved_model,
          data_type,
          cls_lmt=None,
          img_shape=img_shape,
          batch_size=32,
          no_epoch=100)
    # Testing the algorithm
    saved_model = raw_input(
        "Enter the absolute path for the model saved after training")
    acc = test(model_name,
               saved_model,
               data_type,
               seq_length,
               cls_lmt=None,
               img_shape=None,
               batch_size=30)
    with open('Testfile.csv', 'a+') as f:
        op = model_name + ',' + str(seq_length) + ',' + str(
            max_frames) + ',' + saved_model + ',' + str(acc) + '\n'
        f.write(op)
    return
Example #7
def train():
    if request.method == 'POST':
        f = request.files.getlist("my_file[]")
        for i in f:
            i.save('raw/' + i.filename)
    makeinterim()
    makeprocessed()
    extract_features()
    report = train_pred()
    df = pd.DataFrame(report).transpose()
    return render_template('classification_report.html', tables=[df.to_html()])
Example #8
def main():

    foldername = 'data'
    X, X_test_kaggle, y, groups, lenc = load_data(foldername)
    X, X_test_kaggle = extract_features(X), extract_features(X_test_kaggle)

    classifiers = [
        LinearDiscriminantAnalysis(),
        SVC(kernel='linear', probability=True),
        SVC(kernel='rbf', probability=True),
        LogisticRegression(),
        RandomForestClassifier(n_estimators=1000, max_depth=4),
        KNeighborsClassifier(),
        ExtraTreesClassifier(n_estimators=1000, max_depth=4)
    ]

    names = [
        'LDA', 'SVC(linear)', 'SVC(rbf)', 'Logistic Regression',
        'Random Forest', 'KNeighbors', 'ExtraTrees'
    ]
    scores_by_clf = defaultdict(list)

    rs = GroupShuffleSplit(n_splits=100, test_size=0.2)
    for trindex, tsindex in rs.split(X, y, groups):
        X_train, y_train = X[trindex, :], y[trindex]
        X_test, y_test = X[tsindex, :], y[tsindex]

        # np.unique with return_counts gives the class list and per-class counts directly
        classes, counts = np.unique(y_train, return_counts=True)
        print("Training set has classes: ", classes)
        print(list(counts))

        classes, counts = np.unique(y_test, return_counts=True)
        print("Testing set has classes: ", classes)
        print(list(counts))

        for i in range(len(classifiers)):
            clf = classifiers[i]
            pred, score, clf, proba = classify(clf,
                                               X_train,
                                               y_train,
                                               X_test,
                                               y_test,
                                               groups=groups)
            print(np.unique(pred))
            print(names[i], 'score: %.3f \n' % score)
            scores_by_clf[names[i]].append(score)

    for clfname in scores_by_clf.keys():
        print(clfname, np.mean(scores_by_clf[clfname]))
Example #9
def main(args):
    print('===> args:\n', args)
    config_json = args.config_json
    image_dir = args.image_dir
    save_dir = args.save_dir
    image_list_file = args.image_list_file
    gpu_id = args.gpu
    extract_features(config_json,
                     save_dir,
                     image_list_file,
                     image_dir,
                     gpu_id=gpu_id)
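The args object in Example #9 presumably comes from argparse; a hypothetical parser wired to the attributes used above (flag names are assumptions) might be:

import argparse

parser = argparse.ArgumentParser(description='batch feature extraction')
parser.add_argument('--config_json', required=True)
parser.add_argument('--image_dir', required=True)
parser.add_argument('--save_dir', required=True)
parser.add_argument('--image_list_file', required=True)
parser.add_argument('--gpu', type=int, default=0)
main(parser.parse_args())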
Example #10
def build_features(scale_features=False):
    film_list = []
    feat_list = []
    qual_list = []
    count = 0
    with open('imsdb_ratings_processed.csv', 'rb') as csvfile:
        __ = csvfile.readline()
        reader = csv.reader(csvfile)
        for row in reader:
            count += 1
            print "Film No.: " + str(count)
            #if count == 10:
            #	break
            #if get_status(row) == "Good" or get_status(row) == "Bad":
            file_name = file_base + row[0] + ".txt"
            print file_name
            scenes = format_script(file_name)
            if scenes is None:
                print "No file or formatting error"
                continue
            print "Num scenes: " + str(len(scenes))
            if len(scenes) < 5:
                print "Skipping..."
                continue
            # Segment level features
            chunks = script_to_n_chunks(scenes)
            features = {}
            for idx, chunk in enumerate(chunks):
                # change input parameter to scenes for extract features
                try:
                    chunk_features = extract_features(chunk, idx + 1)
                except Exception:
                    # retry once before giving up on this chunk
                    try:
                        chunk_features = extract_features(chunk, idx + 1)
                    except Exception:
                        print "Failed to extract features"
                        continue
                features.update(chunk_features)
            # Full script features
            script_summary_features = get_summary_features(scenes, row[0])
            final_features = {}
            final_features.update(features)
            final_features.update(script_summary_features)
            feat_list.append(final_features)
            #if get_status(row) == "Good":
            qual_list.append(row[4])
            #else:
            #qual_list.append(0)
            film_list.append(row[0])
    return film_list, feat_list, qual_list
Example #11
def build_features(scale_features = False):
	film_list = []
	feat_list = [] 
	qual_list = []
	count = 0
	with open('imsdb_ratings_processed.csv', 'rb') as csvfile:
		__ = csvfile.readline()
		reader = csv.reader(csvfile)
		for row in reader:
			count += 1
			print "Film No.: " + str(count)
			#if count == 10:
			#	break
			#if get_status(row) == "Good" or get_status(row) == "Bad":
			file_name = file_base + row[0] + ".txt"
			print file_name
			scenes = format_script(file_name)
			if scenes is None:
				print "No file or formatting error"
				continue
			print "Num scenes: " + str(len(scenes))
			if len(scenes) < 5:
				print "Skipping..."
				continue
			# Segment level features
			chunks = script_to_n_chunks(scenes)
			features = {}
			for idx, chunk in enumerate(chunks):
			# change input parameter to scenes for extract features
				try:
					chunk_features = extract_features(chunk, idx + 1)
				except:
					try:
						chunk_features = extract_features(chunk, idx + 1)
					except:
						print "Failed to extract features"
						continue
				features.update(chunk_features)
			# Full script features
			script_summary_features = get_summary_features(scenes, row[0])
			final_features = {}
			final_features.update(features)
			final_features.update(script_summary_features)
			feat_list.append(final_features)
			#if get_status(row) == "Good":
			qual_list.append(row[4])
			#else:
			#qual_list.append(0)
			film_list.append(row[0])
	return film_list, feat_list, qual_list
Example #12
def training(data, classifier, one_vs_all, apply_pca, tag_dict):
    ind_tags_phrases = individual_phrase_tags(data, tag_dict)
    ind_phrases, ind_tags, phrase_position = [], [], []

    for i in ind_tags_phrases:
        ind_phrases.append(i[0].split(',')[1])
        ind_tags.append(i[0].split(',')[0])
        phrase_position.append(i[1])
    print('train', len(ind_phrases))

    train_features = extract_features(ind_phrases, phrase_position, 'training')
    write_features_labels('./', ind_phrases, train_features, ind_tags,
                          'train_features')
    pca, scaler = None, None
    if apply_pca == 'y':
        scaler = StandardScaler()
        scaler.fit(train_features)
        train_features = scaler.transform(train_features)
        pca = PCA(0.95)
        pca.fit(train_features)
        train_features = pca.transform(train_features)
    model = OneVsRestClassifier(RandomForestClassifier(n_estimators=100),
                                n_jobs=-1)
    model.fit(train_features, ind_tags)
    pred_tags = model.predict(train_features)
    accuracy = metrics.accuracy_score(ind_tags, pred_tags)
    return model, accuracy, pca, scaler
Example #13
    def evaluate_windows_of_size(self, windows, window_size):
        X = []
        for w in windows:
            window_yuv = self.cropped_frame_yuv[int(w.y1):int(w.y2),
                                                int(w.x1):int(w.x2)]
            if w.width() != window_size or w.height() != window_size:
                window_yuv = cv2.resize(window_yuv, (window_size, window_size))

            ppc = 8 if self.use_hires_classifier else 16
            X.append(extract_features(window_yuv, window_size, ppc=ppc))
        X = np.array(X)

        windows = np.array(windows)
        normalized_feature_vector = self.scaler[window_size].transform(X)
        score = self.classifier[window_size].predict(normalized_feature_vector)
        pos_window_indexes = np.where(score == 1.0)[0]
        pos_windows = windows[pos_window_indexes]
        pos_window_scores = score[pos_window_indexes]

        result = []
        self.evaluated_windows.extend(windows)
        for r, score in zip(pos_windows, pos_window_scores):
            result.append((r, score))
            if self.save_false_positives and self.is_false_positive_candidate(
                    r):
                window_img = crop_img(self.cropped_frame, r.x1, r.y1, r.x2,
                                      r.y2)
                save_img(
                    window_img, "%s/%d/%04d-%04d" %
                    (self.false_positive_dir_name, window_size,
                     self.frame_count, self.false_positive_count))
                self.false_positive_count += 1

        return result
Example #14
def BoltMotionObjToFeatureObj(all_bolt_data):
    """
    Pull out PCA components from all data

    For each object - pull out features and store in feature_obj
    with the same structure as all_bolt_data
   
        Dictionary - "tap", "slide", "slow_slide", 
                     "thermal_hold", "squeeze"

    """
    # Do PCA calculations here

    # Store in feature class object
    all_features_obj_dict = dict()

    for motion_name in all_bolt_data:
        trial_list = all_bolt_data.get(motion_name)
        print motion_name

        feature_list = list()
        # For all objects
        for trial in trial_list:
            
            bolt_feature_obj = extract_features.extract_features(trial)
            
            feature_list.append(bolt_feature_obj)

        # Store all of the objects away
        all_features_obj_dict[motion_name] = feature_list
            
    return all_features_obj_dict        
Example #15
    def compute_probability_vector(self, bolt_obj):
        if bolt_obj.state == bolt_obj.TAP:
            # Store results as they come in
            self.adjective_vectors = dict()
            self.all_motion_results = dict()
        
        # Store dictionary of strings
        self.state_string = {bolt_obj.DISABLED:'disabled',
                    bolt_obj.THERMAL_HOLD:'thermal_hold',
                    bolt_obj.SLIDE:'slide',
                    bolt_obj.SQUEEZE:'squeeze',
                    bolt_obj.TAP:'tap',
                    bolt_obj.DONE:'done',
                    bolt_obj.SLIDE_FAST:'slide_fast',
                    bolt_obj.CENTER_GRIPPER:'center_gripper'
                    }   
       
        
        current_motion = self.state_string[bolt_obj.state] 
       
        # Create feature vector 
        self.bolt_object = bolt_obj 
        utilities.normalize_data(self.bolt_object)
        self.bolt_feature_object = extract_features.extract_features(self.bolt_object, self.pca_model[current_motion]) 

        # Create a dictionary to store the results in
        for adj in self.all_classifiers:
            results, prob = utilities.compute_adjective_probability_score(self.all_classifiers[adj], self.bolt_feature_object, self.feature_list, adj, self.scaler_dict)
            
            # Store off adjective probabilities for ensemble 
            if adj in self.adjective_vectors:
                pass 
            else:
                self.adjective_vectors[adj] = list()
           
            self.adjective_vectors[adj].append(prob)
           
            # Store classifier score based on best motion
            best_motion = self.best_motion_dict[adj][1]
            if current_motion  == best_motion:
                rospy.loginfo("Best Motion is: %s" % best_motion)
                self.all_motion_results[adj] = results
        
        print len(self.adjective_vectors[adj])
        if len(self.adjective_vectors[adj]) == 5:
            ensembled_results = dict() 
            print self.adjective_vectors 
            #for adj in self.adjective_vectors: 
            #    ensembled_results[adj] = self.ensemble_classifiers[adj].predict(self.adjective_vectors[adj])

            # Store off the adjectives that returned true
            adjectives_found = []
            for adj in self.all_motion_results:
                if self.all_motion_results[adj] == 1:
                    adjectives_found.append(adj)

            print "Results from max classification"
            print self.all_motion_results
            print str(adjectives_found) 
            self.adjectives_pub.publish(str(adjectives_found))
Example #16
def BoltMotionObjToFeatureObj(all_bolt_data, electrode_pca_dict):
    """ 

    For each object - pull out features and store in feature_obj
    with the same structure as all_bolt_data
   
        Dictionary - "tap", "slide", "slow_slide", 
                     "thermal_hold", "squeeze"

    """

    # Store in feature class object
    all_features_obj_dict = dict()

    for motion_name in all_bolt_data:
        trial_list = all_bolt_data.get(motion_name)
        print motion_name

        feature_list = list()
        # For all objects
        for trial in trial_list:

            bolt_feature_obj = extract_features.extract_features(trial, electrode_pca_dict[motion_name])

            feature_list.append(bolt_feature_obj)

        # Store all of the objects away
        all_features_obj_dict[motion_name] = feature_list

    return all_features_obj_dict
Example #17
def cascaded_retrieval(queries, ord_features, feature_dict, k_list,
                       metric_dict):
    queries = preprocess_sketches(queries)

    results = []

    for query in queries:
        img_indices = np.arange(len(feature_dict[ord_features[0]]))
        for i, feature in enumerate(ord_features):
            sketch_features = extract_features(feature, query)
            if feature != "sift":
                image_features = np.asarray(feature_dict[feature])
                image_features = image_features[img_indices]
                sketch_features = np.array(sketch_features).reshape(1, -1)
                # print(f"Image: {np.array(image_features).shape}")
                # print(f"Sketch: {np.array(sketch_features).shape}")
            else:
                image_features = np.asarray(feature_dict[feature])
                image_features = [image_features[idx] for idx in img_indices]
                sketch_features = [sketch_features]
            distances = compute_distances(image_features, sketch_features,
                                          metric_dict[feature])
            top_results = get_top_results(k_list[i], distances)
            img_indices = img_indices[top_results[0]]
        results.append(img_indices)
    return results
Example #18
def train_model():
    for files in file_paths:
        # Each speaker will have 1 features array
        features = np.asarray(())
        for filepath in files:
            print("Training: " + filepath)
            try:
                sr, audio = read(filepath)
                vector = extract_features(audio, sr)
            except Exception as e:
                print(e)
                continue

            if features.size == 0:
                features = vector
            else:
                try:
                    features = np.vstack((features, vector))
                except:
                    print("ValueError: Shape does not match")

        # gmm
        gmm = GMM(n_components=16,
                  max_iter=200,
                  covariance_type='diag',
                  n_init=3)
        gmm.fit(features)

        # export trained model
        picklefile = model + os.path.basename(filepath).split('_')[0] + ".gmm"
        with open(picklefile, 'wb') as gmm_file:
            pickle.dump(gmm, gmm_file)

        print('successfully built model for speaker:', picklefile,
              ' with data shape =', features.shape)
Example #19
def process_file(img):
    """process single image, extract features and emotions
        the process creates the user model as well.
    Args:
        img (EmotionalImage): image data
    """
    assert isinstance(img, Image)
    if str(img.name).find('json') > -1:
        return

    user = get_user(os.path.join(img.path, 'meta.json'))
    filePath = os.path.join(img.path, img.name)
    # logging.info("---------------Processing----------------", img.name)
    print("---------------Processing----------------", img.name)

    try:
        features = extract_features(filePath)
    except:
        logging.exception('Something went wrong with feature extraction')
        return

    try:
        emotions = predict_emotions(features)
    except:
        logging.exception('Something went wrong with emotions extraction')
        return

    uuid1 = uuid.uuid4()
    emImage = EmotionalImage(uuid1, img.name, img.path, features, emotions, "",
                             "", "")

    # TODO: fix this; currently adds one image at a time. Not a functional issue, but it can be improved.
    user.images.append(emImage)
    user.save()
Example #20
def compare_features(filepath):
    bboxes, face_features = extract_features(filepath)
    if len(bboxes) == 0:
        return []
    names, allfeatures = load_features()
    results = []
    for bbox, feature in zip(bboxes, face_features):
        # avoid shadowing the built-in min/max
        min_dist = sys.float_info.max
        max_dist = 0.0
        nearest = None
        furthest = None
        for n, f in zip(names, allfeatures):
            dist = scipy.spatial.distance.cosine(feature, f)
            if dist < min_dist:
                nearest = n
                min_dist = dist
            if dist > max_dist:
                furthest = n
                max_dist = dist
        results.append({
            'face': bbox,
            'nearest': {
                'name': nearest,
                'distance': min_dist
            },
            'furthest': {
                'name': furthest,
                'distance': max_dist
            }
        })

    return results
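The pairwise loop above can be vectorized; a sketch with scipy's cdist, assuming allfeatures stacks into a 2-D array:

import numpy as np
from scipy.spatial.distance import cdist

# one row of cosine distances per query face feature
dists = cdist(np.atleast_2d(feature), np.asarray(allfeatures), 'cosine')[0]
nearest, furthest = names[int(np.argmin(dists))], names[int(np.argmax(dists))]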
Example #21
def BoltMotionObjToFeatureObj(all_bolt_data, electrode_pca_dict):
    """

    For each object - pull out features and store in feature_obj
    with the same structure as all_bolt_data
   
        Dictionary - "tap", "slide", "slow_slide", 
                     "thermal_hold", "squeeze"

    """

    # Store in feature class object
    all_features_obj_dict = dict()

    for motion_name in all_bolt_data:
        trial_list = all_bolt_data.get(motion_name)
        print motion_name

        feature_list = list()
        # For all objects
        for trial in trial_list:

            bolt_feature_obj = extract_features.extract_features(
                trial, electrode_pca_dict[motion_name])

            feature_list.append(bolt_feature_obj)

        # Store all of the objects away
        all_features_obj_dict[motion_name] = feature_list

    return all_features_obj_dict
Example #22
def train(data_folder=TRAIN_FOLDER,
          labels_file=LABELS_FILE,
          model_prefix=MODEL_PREFIX,
          cdf_prediction=PREDICT_CDF,
          parameters=PARAMETERS):
    """Train ensemble."""

    random.seed(SEED)
    _, sys_lab, dia_lab, features = \
        extract_features.extract_features(data_folder, labels_file)
    assert len(sys_lab) == len(dia_lab) == features.shape[0]

    print 'total data', features.shape[0]
    print 'num features', features.shape[1]

    random.seed(SEED)
    test_prop = TEST_PROP
    test_size = int(features.shape[0] * test_prop)
    # random.shuffle shuffles in place and returns None, so shuffle an index list
    shuffled_indices = range(features.shape[0])
    random.shuffle(shuffled_indices)
    sys_lab = [sys_lab[i] for i in shuffled_indices]
    dia_lab = [dia_lab[i] for i in shuffled_indices]
    features = features[shuffled_indices]

    test_sys_lab = sys_lab[:test_size]
    test_dia_lab = dia_lab[:test_size]
    test_features = features[:test_size]

    train_sys_lab = sys_lab[test_size:]
    train_dia_lab = dia_lab[test_size:]
    train_features = features[test_size:]

    print 'train size', train_features.shape[0]
    print 'test size', test_features.shape[0]

    systole_prefix = '%s_sys' % model_prefix
    diastole_prefix = '%s_dia' % model_prefix

    print 'training systole model'
    _train(train_features, train_sys_lab, test_features, test_sys_lab,
           model_prefix=systole_prefix,
           cdf_prediction=cdf_prediction,
           parameters=parameters)

    print 'training diastole model'
    _train(train_features, train_dia_lab, test_features, test_dia_lab,
           model_prefix=diastole_prefix,
           cdf_prediction=cdf_prediction,
           parameters=parameters)

    print 'final systole evaluation on test set'
    sys_crps = evaluate(test_features, test_sys_lab,
                        model_prefix=systole_prefix,
                        cdf_prediction=cdf_prediction)

    print 'final diastole evaluation on test set'
    dia_crps = evaluate(test_features, test_dia_lab,
                        model_prefix=diastole_prefix,
                        cdf_prediction=cdf_prediction)

    print 'test sys crps', sys_crps
    print 'test dia crps', dia_crps
    print 'overall test crps', (sys_crps + dia_crps) * 0.5
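Since random.shuffle returns None, the split above shuffles an index list and reindexes. An equivalent sketch with scikit-learn, assuming it is available to this project:

from sklearn.model_selection import train_test_split

(train_features, test_features,
 train_sys_lab, test_sys_lab,
 train_dia_lab, test_dia_lab) = train_test_split(
    features, sys_lab, dia_lab, test_size=TEST_PROP, random_state=SEED)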
Example #23
 def evaluate_window(self, window):
     window_size = window.height()
     window_yuv = self.cropped_frame_yuv[int(window.y1):int(window.y2),
                                         int(window.x1):int(window.x2)]
     X = np.array(extract_features(window_yuv, window_size))
     normalized_feature_vector = self.scaler[window_size].transform(X)
     return self.classifier[window_size].predict(
         normalized_feature_vector)[0]
Example #24
def define_segments(QLINK_URLS, UNKNOWN_URLS, QUOTA):
    global quota
    global clusterizer
    global df
    global classificator
    global toexist
    global standartizator
    quota = []
    y = np.empty(1000)
    threshold = 90
    nclusters = 15
    df = pd.DataFrame()
    df = df.append(exf.extract_features(QLINK_URLS), ignore_index=True)
    df = df.append(exf.extract_features(UNKNOWN_URLS), ignore_index=True)

    cnts = df.count()
    df = df.fillna(0)
    toexist = []
    for i in xrange(len(cnts)):
        if cnts[i] > threshold:
            toexist.append(df.columns[i])
    y[:500] = 1
    y[500:] = 0
    X = df[toexist].values

    # X1 = TSNE().fit_transform(X)

    # plt.scatter(X1[:, 0], X1[:, 1], c=y*8, cmap=plt.cm.get_cmap("jet", 10), s=1)
    # plt.colorbar(ticks=range(10))
    # plt.clim(-0.5, 9.5)
    # plt.show()

    standartizator = StandardScaler()
    X = standartizator.fit_transform(X)
    clusterizer = KMeans(n_clusters=nclusters)
    clusterizer.fit(X)
    #classificator = LDA(solver='lsqr').fit(X, y)
    #classificator = SVC()
    classificator = KNeighborsClassifier()
    classificator.fit(X, y)
    qlInCluster = []

    for i in xrange(nclusters):
        qlInCluster.append(sum(y[clusterizer.labels_ == i]))
        quota.append(90 * qlInCluster[i] + QUOTA / 100)
Example #25
def daily_batch():
    print "**** [Step 1] Clean old data and prepare directories ****"
    try:
        shutil.rmtree('data/')
        os.makedirs('data/')
        os.makedirs('output/')
    except:
        pass
    print "**** [Step 2] Crawl news articles ****"
    grd.get_raw_data()
    print "**** [Step 3] Compute features for each news articles ****"
    ef.extract_features(mode="batch")
    print "**** [Step 4] Compute the related news ****"
    print "(may take long time)"
    fv, id_list = grn.get_feature_vectors(mode="batch")
    grn.ANN(fv, id_list)
    print "**** [Step 5] Loading data to Redis ****"
    ftr.load_data(mode="batch")
    print "DONE!"
Example #26
    def input_fn():
        # Load and parse dataset
        dataset = tf.data.TFRecordDataset(filename, compression_type='GZIP')
        corpus = dataset.map(exp_TFR.decode, num_parallel_calls=8)

        corpus = corpus.shuffle(5000000)

        # Build the dictionary
        # Extract the top 60000 most common words to include in our embedding vector
        vocab_file_path = "Dataset/Vocabulary/vocabulary.txt"
        vocab_size = 60000

        # Gather together all the unique words and index them with a unique integer value
        # Loop through every word in the dataset and assign it its unique integer word identifier.
        # Any word not within the top 60000 most common words is marked with "-1" and replaced with the "UNK" token

        # Load the dictionary populated by keys corresponding to each unique word
        table = tf.contrib.lookup.index_table_from_file(
            vocabulary_file=vocab_file_path,
            vocab_size=vocab_size,
            key_column_index=1,
            delimiter=' ')

        # Create a reverse_table that allows us to look up a word based on its unique integer identifier,
        # rather than looking up the identifier based on the word.
        # reverse_table = tf.contrib.lookup.index_to_string_table_from_file(vocabulary_file=vocab_file_path,
        #                                                                   vocab_size=vocab_size,
        #                                                                   value_column_index=1,
        #                                                                   delimiter=' ')

        # Load ocean dictionary
        ocean_dict_file_path = "Dataset/Vocabulary/ocean_dict_filtered.txt"
        ocean_dict_size = 634  # 636 before (deleted 2 adjective)

        # Ocean lookup-table
        ocean_table = tf.contrib.lookup.index_table_from_file(
            vocabulary_file=ocean_dict_file_path,
            vocab_size=ocean_dict_size,
            key_column_index=0,
            delimiter='\t')

        # Extract labels and features and generate dataset
        dataset = corpus.map(
            lambda ids, text: extract_features(ids, text, table, ocean_table),
            num_parallel_calls=8)

        # Delete all the sentences without adjective
        # dataset = dataset.filter(lambda features, ocean_vector: tf.reduce_all(tf.is_finite(ocean_vector)))
        dataset = dataset.filter(lambda features, ocean_value: tf.reduce_all(
            tf.is_finite(ocean_value)))

        dataset = dataset.batch(batch_size=500)

        return dataset
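This input_fn follows the TF 1.x Estimator input contract; a hedged usage sketch, assuming a model_fn defined elsewhere:

estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir='checkpoints/')
estimator.train(input_fn=input_fn, steps=10000)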
Example #27
def main():
    """These are the main training settings. Set each before running
    this file."""
    #os.environ['CUDA_VISIBLE_DEVICES'] = "0"
    start = time.time()

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"


    if len(sys.argv) == 5:
        seq_length = int(sys.argv[1])
        class_limit = int(sys.argv[2])
        image_height = int(sys.argv[3])
        image_width = int(sys.argv[4])
    else:
        print("Usage: python train.py sequence_length class_limit image_height image_width")
        print("Example: python train.py 75 2 720 1280")
        exit(1)

    sequences_dir = os.path.join('data', 'sequences')
    if not os.path.exists(sequences_dir):
        os.mkdir(sequences_dir)

    checkpoints_dir = os.path.join('data', 'checkpoints')
    if not os.path.exists(checkpoints_dir):
        os.mkdir(checkpoints_dir)

    # model can only be 'lstm'
    model = 'lstm'
    saved_model = None  # None or weights file
    load_to_memory = False  # pre-load the sequences into memory
    batch_size = 16
    nb_epoch = 50
    data_type = 'features'
    image_shape = (image_height, image_width, 3)

    extract_features(seq_length=seq_length, class_limit=class_limit, image_shape=image_shape)
    train(data_type, seq_length, model, saved_model=saved_model,
          class_limit=class_limit, image_shape=image_shape,
          load_to_memory=load_to_memory, batch_size=batch_size, nb_epoch=nb_epoch)
    print('time required {:0.3f}'.format(time.time() - start))
Example #28
def main():
    
    # Load the data, get features, scale data
    foldername = 'data'
    X, X_test_kaggle, y, groups, lenc = load_data(foldername)
    X, X_test_kaggle = extract_features(X), extract_features(X_test_kaggle)
    print('the shape of X: ' + str(X.shape))
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    X_test_kaggle = scaler.transform(X_test_kaggle)

    classifiers = [RandomForestClassifier(n_estimators=500, max_depth=4)]
    predictions, scores, classifiers, probabilities = classify_multi(classifiers, X, y, X_test_kaggle)
    print(predictions)

    # Submission stuff
    labels = lenc.inverse_transform(predictions[:, 0].astype(int))
    with open('results/submission.csv', "w") as fp:
        fp.write("# Id,Surface\n")
        for i in range(len(labels)):
            fp.write("%d,%s\n" % (i, labels[i]))
Example #29
def predict(wav_path, textgrid_path, start_extract, end_extract):
    tmp_feature_file = generate_tmp_filename('features')
    tmp_prob_file = generate_tmp_filename('prob')
    tmp_predict_file = generate_tmp_filename('prediction')
    if not os.path.exists(wav_path):
        print >> sys.stderr, "wav file %s does not exist" % wav_path
        return
    length = utils.get_wav_file_length(wav_path)

    print '\n1) Extracting features and classifying ...'
    extract_features(wav_path, tmp_feature_file, start_extract, end_extract)
    run(tmp_feature_file, tmp_prob_file)
    print '\n3) Extract Durations ...'
    post_process(tmp_prob_file, tmp_predict_file)
    print '\n4) Writing TextGrid file to %s ...' % textgrid_path
    create_text_grid(tmp_predict_file, textgrid_path, length, float(start_extract))

    # remove leftovers
    os.remove(tmp_feature_file)
    os.remove(tmp_prob_file)
    os.remove(tmp_predict_file)
Example #30
def main():
    import sys

    classifier = cv2.SVM()
    classifier.load(sys.argv[1])

    im = cv2.imread(sys.argv[2])
    im = cv2.GaussianBlur(im, (3, 3), 0)
    imshow_large(__file__, im)
    cv2.waitKey()

    unrotated = unrotate(im)
    #
    # Check if image is right way up, correct otherwise
    #
    imshow_large(__file__, unrotated)
    key = cv2.waitKey()
    if key & 0xff == ord("r"):
        unrotated = cv2.flip(cv2.flip(unrotated, 0), 1)
        imshow_large(__file__, unrotated)
        cv2.waitKey()

    binarized = binarize(unrotated)
    rois = get_rois(binarized)
    results = {}

    grayscale = cv2.cvtColor(unrotated, cv.CV_BGR2GRAY)

    for (x, y, width, height) in rois:
        roi = grayscale[y:y + height, x:x + width]
        vec = extract_features(roi)
        label = classifier.predict(vec)
        results[(x, y, width, height)] = "01234567890X"[int(label)]

    scale = SCREEN_HEIGHT / unrotated.shape[0]
    unrotated = cv2.cvtColor(grayscale, cv.CV_GRAY2BGR)
    scaled = cv2.resize(unrotated, (0, 0), fx=scale, fy=scale)

    for roi in rois:
        x = int(roi[0] * scale)
        y = int(roi[1] * scale)
        width = int(roi[2] * scale)
        height = int(roi[3] * scale)
        cv2.rectangle(scaled, (x, y), (x + width, y + height), (0, 255, 0, 0),
                      1)
        if results[roi] == "X":
            continue
        cv2.putText(scaled, results[roi], (x, y), cv.CV_FONT_HERSHEY_SIMPLEX,
                    1, (0, 255, 0, 0))

    cv2.imshow(__file__, scaled)
    cv2.waitKey()
Example #31
def test_classifier_parameters(classifier='classifier_name', infile='features.npz', subcats=True, norm=False):
	"""
	main function to determine the number of features to be used for classification and the classifier hyperparameters
		classifier - classifier to be used, Naive Bayes or LDA
		infile - .npz file with the extracted features
		subcats - Boolean flag passed through to extract_features
		norm - Boolean parameter whether to normalize the Naive Bayes for unbalanced class sizes
	"""
	# extract numpy arrays, lists and dictionaries from the features.npz
	extract_features(subcats=subcats)
	features_file = np.load(infile)
	features, featurenames, categoryids, categories = features_file['features'], features_file['featurenames'], \
													features_file['categoryids'], features_file['categories'].item()

	labels = categoryids[0,:]
	features = features.T
	categoryids = categoryids.T
	if classifier == 'Naive Bayes':
		# if True, do the normalization for unbalanced class sizes
		if norm:
			features = normalize(features, categoryids, categories)
		classify('nb', features, labels, categories)
	else:
		classify('lda', features, labels, categories)
Example #32
def main():
    import sys

    classifier = cv2.SVM()
    classifier.load(sys.argv[1])

    im = cv2.imread(sys.argv[2])
    im = cv2.GaussianBlur(im, (3,3), 0)
    imshow_large(__file__, im)
    cv2.waitKey()
    
    unrotated = unrotate(im)
    #
    # Check if image is right way up, correct otherwise
    #
    imshow_large(__file__, unrotated)
    key = cv2.waitKey()
    if key & 0xff == ord("r"):
        unrotated = cv2.flip(cv2.flip(unrotated, 0), 1)
        imshow_large(__file__, unrotated)
        cv2.waitKey()

    binarized = binarize(unrotated)
    rois = get_rois(binarized)
    results = {}

    grayscale = cv2.cvtColor(unrotated, cv.CV_BGR2GRAY)
    
    for (x,y,width,height) in rois:
        roi = grayscale[y:y+height,x:x+width]
        vec = extract_features(roi)
        label = classifier.predict(vec)
        results[(x,y,width,height)] = "01234567890X"[int(label)]

    scale = SCREEN_HEIGHT/unrotated.shape[0]
    unrotated = cv2.cvtColor(grayscale, cv.CV_GRAY2BGR)
    scaled = cv2.resize(unrotated, (0,0), fx=scale, fy=scale)
    
    for roi in rois:
        x = int(roi[0]*scale)
        y = int(roi[1]*scale)
        width = int(roi[2]*scale)
        height = int(roi[3]*scale)            
        cv2.rectangle(scaled, (x,y), (x+width, y+height), (0,255,0,0), 1)
        if results[roi] == "X":
            continue
        cv2.putText(scaled, results[roi], (x, y), cv.CV_FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0, 0))

    cv2.imshow(__file__, scaled)
    cv2.waitKey()
Example #33
def testing(data, model, pca, scaler, apply_pca):
    ind_tags_phrases = individual_phrase_tags(data)
    ind_phrases, ind_tags, phrase_position = [], [], []
    for i in ind_tags_phrases:
        ind_phrases.append(i[0].split(',')[1])
        ind_tags.append(i[0].split(',')[0])
        phrase_position.append(i[1])
    test_features = extract_features(ind_phrases, phrase_position, 'testing')
    if apply_pca == 'y':
        test_features = scaler.transform(test_features)
        test_features = pca.transform(test_features)

    pred_tags = model.predict(test_features)
    accuracy = metrics.accuracy_score(ind_tags, pred_tags)
    return accuracy
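Examples #12 and #33 pair up; a sketch of the train-then-evaluate flow, assuming train_data, test_data, and tag_dict come from the caller (note the classifier and one_vs_all arguments are accepted but unused above):

model, train_acc, pca, scaler = training(train_data, 'rf', one_vs_all=True,
                                         apply_pca='y', tag_dict=tag_dict)
test_acc = testing(test_data, model, pca, scaler, 'y')
print('train accuracy: %.3f, test accuracy: %.3f' % (train_acc, test_acc))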
Example #34
def fetch_url(url):
    global quota
    global clusterizer
    global df
    global standartizator
    d = exf.extract_features([url], toexist)[0].values()
    d = standartizator.transform([d])[0]
    cls = clusterizer.predict([d])[0]
    if classificator.predict_proba([d])[0][1] > 0.7:
        quota[cls] -= 1
        return True
    if quota[cls] > 0:
        quota[cls] -= 1
        return True
    else:
        return False
Example #35
 def get_test_data(self):
     # loads the features array
     file_list = os.listdir(os.path.join(self.test_path, 'video'))
     # with open(os.path.join(self.test_path, 'testing.txt')) as testing_file:
     # lines = testing_file.readlines()
     # file_name = lines[self.num].strip()
     file_name = file_list[self.num]
     path = os.path.join(self.test_path, 'feat', file_name + '.npy')
     if os.path.exists(path):
         f = np.load(path)
     else:
         model = extract_features.model_cnn_load()
         f = extract_features.extract_features(file_name, model)
     # stop one short of the end, otherwise file_list[self.num] would go out of range
     if self.num < len(file_list) - 1:
         self.num += 1
     else:
         self.num = 0
     return f, file_name
Example #36
def calculateFoldability(seq, sec_str):

    #---------------------------#
    #seq = 'GUAAGUCGGGGACCUCUUAAGAUGAGAGACUUCUGAACCGGGUCAGGAUCGGAAGAUAGCAGCCCUAAGGAAAGGCCUUUUGUGCUAAGAGUCUUCUCUGACUUAC'
    #sec_str = '..(.((((((((.(((((.(((((((((..((((.....((((....(((....)))....))))...))))....))))))).)))))))...)))))))))...'
    #---------------------------#
    real_RNA_loc = "RNASTRAND_real_feature_space.csv"
    folder_simulation_result = "/home/dhrumil/Desktop/Lab/RNAWebsite_v1/RNASTRAND_extract_feature/"
    #---------------------------#

    df_pred = extract_features.extract_features(seq, sec_str)
    clf = bulid_model(real_RNA_loc, folder_simulation_result)
    foldability = pred_foldability(df_pred, clf)
    fe, mfe, mfe_str = pred_fe(seq, sec_str)
    print('sequence:', seq)
    print('secondary structure:', sec_str)
    return foldability,fe,mfe,mfe_str
Example #37
def validate(data_folder=VALIDATE_FOLDER,
             model_prefix=MODEL_PREFIX,
             cdf_prediction=PREDICT_CDF):
    """Use ensemble to predict CDFs on validation set."""

    study_ids, sys_lab, dia_lab, features = \
        extract_features.extract_features(data_folder)
    assert study_ids == tuple(range(501, 701))
    assert len(sys_lab) == len(dia_lab) == features.shape[0]

    print 'total data', features.shape[0]
    print 'num features', features.shape[1]

    systole_prefix = '%s_sys' % model_prefix
    diastole_prefix = '%s_dia' % model_prefix

    systole_cdfs = predict_cdfs(features, systole_prefix, cdf_prediction)
    diastole_cdfs = predict_cdfs(features, diastole_prefix, cdf_prediction)
    return systole_cdfs, diastole_cdfs
Example #38
        #print "\tProcessing: " + names[i*2][0]
        f1 = INPUT_PATH + names[i*2][2]
        f2 = INPUT_PATH + names[i*2+1][2]
        name = names[i*2][0]
        out = CHECK_PATH + name + '.fea.res'
        check_f = CHECK_PATH + name + '.fea'
        if name not in test_result:
            test_result[name] = (0, 0, 0, False)
        if not os.path.exists(check_f):
            print >> sys.stderr, "Check file does not exist"
            continue
        if os.path.exists(out):
            os.remove(out)

        t1 = time.time()
        extract_features.extract_features(f1, f2, out)
        t2 = time.time()
        if not os.path.exists(out):
            print >> sys.stderr, "Output test file was not created"
            continue
        result = read_feas(out)
        pattern = read_feas(check_f)
        t = test_result[name][0] + (t2 - t1)
        w = test_result[name][1] + compare_results(result, pattern)
        c = test_result[name][2] + float(len(result)) / float(len(pattern))

        test_result[name] = (t, w, c, True)

count_fails = 0
for name in test_result.keys():
Example #39
    model = model_from_json(json_string)
    model.load_weights("model_weights/super_awesome_merged_model_weights.hdf5")
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy']
                  )
    # Parse song
    if len(sys.argv) < 2:
        print("missing parameter")
    else:
        filename = sys.argv[1]
        song_folder = os.path.dirname(os.path.realpath(filename))  # should get the directory to the file

        if os.path.isdir(filename):
            batch_thirty_seconds(song_folder)
            extract_features(song_folder)
        else:
            thirty_seconds(filename)
            print("File split. Now extracting features.")
            extract_features(song_folder)
            print("Extracted features.")
             
        keyword_2 = "mfcc_coefficients"

        x2 = []
        for root, dirs, files in os.walk(song_folder, topdown=False):
            for name in files:
                if re.search(keyword_2+".csv",name):
                    song_path = (os.path.join(root,name))

                    song_features = genfromtxt(song_path, delimiter=",")
Example #40
    # Train with 3/4 of the sample + 1/4 of the error
    #
    error_count = sum([e.shape[0] for e in error])
    keep_count = max(error_count * 3, npts_sampled - error_count)
    if coords.shape[0] > keep_count:
        coords = coords[r.permutation(coords.shape[0])[:keep_count], :]
    coords = np.vstack([coords] + error)
p = r.permutation(coords.shape[0])[:npts_sampled]
coords = coords[p, :]
coords = coords[np.lexsort(coords.transpose())]

if ("training_features" in tiffcvt.h5_file.keys() and 
    (tiffcvt.h5_file["training_features"].shape[1] != extract_features.n_features or
     tiffcvt.h5_file["training_features"].shape[0] != npts_sampled)):
    del tiffcvt.h5_file["training_features"]
    del tiffcvt.h5_file["training_classification"]
tf = tiffcvt.h5_file.require_dataset("training_features", 
                                     (npts_sampled, extract_features.n_features),
                                     np.float32)
tc = tiffcvt.h5_file.require_dataset("training_classification", 
                                     (npts_sampled, ), np.uint32)
for i in range(0, coords.shape[0], 1024):
    my_slice = slice(i, min(i+1024, coords.shape[0]))
    tf[my_slice,:] = extract_features.extract_features(
        img, blur_img, coords[my_slice,:])
    tc[my_slice] = labels[coords[my_slice,0],
                          coords[my_slice,1],
                          coords[my_slice,2]]
    print "Finished %d of %d" % (i+1024, npts_sampled)
tiffcvt.h5_file.close()
Example #41
 def align_and_tune(self):
     extract_features(self.original_audio_folder, self.original_features_folder, [features[0]], "jams")
     tune_wavs(self.original_audio_folder, self.tuned_audio_folder, self.original_features_folder)
Example #42
    parser.add_argument('-v', '--verbose',
                        dest='verbose', action='store_true', default=False,
                        help='turn on verbose message output')
    options = parser.parse_args()

    # establish MongoDB connection
    collection = myutils.get_mongodb_collection(options.hosts, options.database)

    # load models for each label
    models = test.load_models(collection['models'], ast.literal_eval(options.model))

    cursor = myutils.get_mysql_connection(options.host, options.db).cursor()
    # construct the testing set from the MediaWiki table
    vectors = []
    for ent in wikilove_revs.get_entries(cursor, options.start, options.end, options.window, options.limit, newest=True):
        features = extract_features.extract_features({'entry': {'content': {'added': [ent.others.message], 'removed':[]},
                                                                'comment': ''}})
        vector = myutils.map_key_dict(int, extract_features.extract_vector(features, options.bits))
        if ent.receiver_id != ent.sender_id:
            vectors.append(myutils.entry_t(ent, features, vector))

    labels = sorted(models.keys())
    
    vecs = [x.vector for x in vectors]
    predictions = [[[] for y in xrange(0, len(labels))] for x in xrange(0,len(vectors))]
    for (n,lname) in enumerate(labels):
        lab,_,val = liblinear.linearutil.predict([0]*len(vecs), vecs, models[lname], '-b 1')
        for (i,(pred,score)) in enumerate(zip(lab,val)):
            predictions[i][n] = score[1] # get the confidence for the label being 'True'

    print >>options.output, '<style type="text/css">.prediction{text-align: right;} td{vertical-align: top;} li{border: 1px solid; list-style: none inside; margin: 0.2em;} ul{padding: 0;} blockquote{ font: normal italic  100% serif; }</style>'
    print >>options.output, '<body style="background: #EEE;">Generated at %s.' % str(datetime.now())
Example #43
 def realign_and_separate_and_analyze(self):
     extract_features(self.tuned_audio_folder, self.tuned_features_folder, [features[0]], "jams")
     separate_channels(self.tuned_audio_folder, self.channels_audio_folder)
     copy_features_of_separated_channels(self.tuned_features_folder, self.channels_features_folder)
     extract_features(self.channels_audio_folder, self.channels_features_folder, features[1:], "jams")