示例#1
0
def telemetry(sid, data):
    """Handle one telemetry event and reply with steering/throttle commands.

    With a truthy ``data`` payload the camera frame is decoded, preprocessed
    and fed to ``model`` to predict the steering angle; the throttle comes
    from ``controller.update`` applied to the reported speed. Both values are
    printed and passed to ``send_control``. If ``args.image_folder`` is
    non-empty, the decoded frame is also written there as a JPEG named after
    the current UTC timestamp. With a falsy payload a ``'manual'`` event is
    emitted instead.

    Args:
        sid: Session identifier passed by the event dispatcher; not used here.
        data: Mapping providing at least the ``"speed"`` and ``"image"``
            (base64-encoded picture) entries, or a falsy value.
    """
    if not data:
        # NOTE: DON'T EDIT THIS.
        sio.emit('manual', data={}, skip_sid=True)
        return

    # The current speed of the car
    speed = data["speed"]
    # The current image from the center camera of the car
    image = Image.open(BytesIO(base64.b64decode(data["image"])))
    # `image` must stay bound to the PIL object because it is saved below;
    # only the array copy goes through preprocessing and prediction.
    image_array = functions.preprocess(np.asarray(image))
    steering_angle = float(
        model.predict(image_array[None, :, :, :], batch_size=1))

    throttle = controller.update(float(speed))

    print(steering_angle, throttle)
    send_control(steering_angle, throttle)

    # save frame
    if args.image_folder != '':
        # Strip the last three digits so the name ends in milliseconds.
        timestamp = datetime.utcnow().strftime('%Y_%m_%d_%H_%M_%S_%f')[:-3]
        image_filename = os.path.join(args.image_folder, timestamp)
        image.save('{}.jpg'.format(image_filename))
示例#2
0
def main():
    """Prepare the image training set described by ``train_data.csv``.

    Reads image paths (column 0) and labels (column 1) from the CSV, one-hot
    encodes the labels and saves the fitted binarizer to
    ``one-hot-matrix.pkl``, loads and preprocesses every image, splits the
    data into training and validation sets using ``VALIDATION_SPLIT``, and
    augments the training set.
    """
    # Load training data from csv
    filename = 'train_data.csv'
    # Builtin ``str`` is what the ``np.str`` alias referred to; the alias was
    # deprecated and later removed from NumPy.
    raw_data = np.loadtxt(filename, dtype=str, delimiter=",")
    image_paths = raw_data[:, 0]
    labels = raw_data[:, 1]

    # One-hot encode labels
    lb = preprocessing.LabelBinarizer()
    one_hot_labels = lb.fit_transform(labels)
    # Save the fitted binarizer so predictions can be decoded later
    functions.save_object(lb, 'one-hot-matrix.pkl')

    # Load training images from directory
    images = np.array([imread(path) for path in image_paths])

    # pre-process images
    images = np.array([functions.preprocess(img) for img in images])

    # Split data into training and validation sets
    x_train, x_valid, y_train, y_valid = train_test_split(
        images, one_hot_labels, test_size=VALIDATION_SPLIT)

    # Augment training set with rotated and flipped images
    x_train, y_train = functions.augment_dataset(x_train, y_train)
示例#3
0
def preprocessing_test(mat):
    """
    Round-trip check: preprocess ``mat`` with ``f.preprocess``, undo it with
    ``f.reverse_preprocess`` and report whether every entry matches the
    original once the difference is rounded to 10 decimal places.
    """
    means, stds, maxes, processed = f.preprocess(mat)
    restored = f.reverse_preprocess(means, stds, maxes, processed)
    residual = np.round(mat - restored, 10)
    matches = residual == 0.
    return np.all(matches)
示例#4
0
def main():
    """Grid-search every classifier/representation pairing and print metrics.

    Positive and negative samples are read from the two text files, labelled
    'YES' / 'NO' and split by ``functions.preprocess`` (second argument 0.5).
    For each pairing a two-step pipeline is grid-searched on the training
    part; the best score, best parameters and the confusion-matrix based
    metrics on the test part are printed.
    """
    positive_set = 'test_extractions/bc_samples.txt'
    negative_set = 'test_extractions/bc_grounds.txt'
    # Alternative inputs used previously:
    #   'test_extractions/test-neural-hash-samples.txt'
    #   'test_extractions/test-neural-hash-ground.txt'
    analogy_list = functions.get_list_re(positive_set)
    non_analogy_list = functions.get_list_re(negative_set)
    yes_samples = [(text, 'YES') for text in analogy_list]
    no_samples = [(text, 'NO') for text in non_analogy_list]
    samples = yes_samples + no_samples
    train_data, train_labels, test_data, test_labels = functions.preprocess(samples, 0.5)

    classifiers = ['svc', 'linearsvc', 'nusvc', 'naive', 'maxEnt', 'neural']
    representations = ['tfidf', 'count', 'hash']
    # Narrower selections; not referenced anywhere in this function.
    classifiers2 = ['neural']
    representations2 = ['hash']

    for clf_name in classifiers:
        for rep_name in representations:
            steps = [
                (rep_name, helpers.get_function(rep_name)),
                (clf_name, helpers.get_function(clf_name)),
            ]
            pipeline = Pipeline(steps)
            parameters = helpers.generate_parameters(rep_name, clf_name)
            search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, error_score=-1)

            print("Performing grid search...")
            print("pipeline:", [step_name for step_name, _ in pipeline.steps])
            print("parameters:")
            pprint(parameters)
            started_at = time()
            search.fit(train_data, train_labels)
            elapsed = time() - started_at
            print("done in %0.3fs" % elapsed)
            print()

            print("Best score: %0.3f" % search.best_score_)
            print("Best parameters set:")
            chosen = search.best_estimator_.get_params()
            for key in sorted(parameters.keys()):
                print("\t%s: %r" % (key, chosen[key]))
            print()

            print("Getting the confusion matrix for the best estimator:")
            predicted = search.best_estimator_.predict(test_data)
            matrix = confusion_matrix(test_labels, predicted, labels=['YES', 'NO'])
            precision, recall, f_measure = functions.fmeasure(matrix)
            accuracy = accuracy_from_matrix(matrix)
            print("Accuracy ", accuracy)
            print("Precision, recall, f-score:")
            print(precision, recall, f_measure)
            print(matrix)
            print()
示例#5
0
def analogy_pipeline(positive_set,
                     negative_set,
                     percent_test,
                     representation,
                     classifier,
                     seed,
                     extra={"sub_class": ""},
                     timer=1000000000):
    """Split the parsed base/target data, dump the test split and classify it.

    The positive/negative files and ``corpora/dmb_open_test.csv`` are read and
    combined into ``samples``, but the split itself is made from
    ``base_target.csv`` via ``functions.preprocess``. The test split is
    written to ``./testing/test_set<k>.csv`` where ``k`` is
    ``int((seed - 1000) / 30)``, and ``functions.classify_pipeline`` is run
    with that rescaled seed.

    Returns:
        Tuple ``(score, f_measure)`` as produced by
        ``functions.classify_pipeline``.
    """
    start = time.time()
    # Positive examples, negative examples and the extra negative corpus.
    analogy_list = functions.get_list_re(positive_set)
    non_analogy_list = functions.get_list_re(negative_set)
    nan_set = functions.read_CSV('corpora/dmb_open_test.csv', 1)
    samples = (
        [(text, 'YES') for text in analogy_list]
        + [(text, 'NO') for text in non_analogy_list]
        + [(txt, 'NO') for txt in nan_set]
    )
    bt_parsed = functions.readCSV('base_target.csv', 1)
    extra = functions.set_extra(extra)
    num_samples = min(len(analogy_list), len(non_analogy_list))

    beginTimer = time.time()
    # Divide the parsed data into a training set and a test set.
    split = functions.preprocess(bt_parsed, percent_test, seed, num_samples,
                                 'test_main_interface_output')
    train_data, train_labels, test_data, test_labels = split

    # From here on the rescaled seed is used, both for the file name and for
    # the classifier call.
    seed = (seed - 1000) / 30
    test_frame = pd.DataFrame({'data': list(test_data)}, columns=['data'])
    test_frame.to_csv('./testing/test_set{}.csv'.format(int(seed)))

    # train_data = functions.strip_id(train_data)
    # test_data = functions.strip_id(test_data)
    outcome = functions.classify_pipeline(train_data, train_labels, test_data,
                                          test_labels, classifier,
                                          representation, seed, extra, timer)
    score, matrix, precision, recall, f_measure = outcome
    print(score)
    print(matrix)
    print(precision, recall, f_measure)
    return score, f_measure
示例#6
0
def main():
    """Classify webcam frames forever with the saved model.

    Loads the pickled label encoder and ``model.h5``, opens capture device 0
    and then, in an endless loop, reads a frame, preprocesses it, runs the
    model on it and maps the output back to a label with the encoder.
    """
    # Label encoder and trained network saved by the training step
    labeler = functions.load_object('one-hot-matrix.pkl')
    model = load_model('model.h5')

    # Webcam handle
    cam = cv2.VideoCapture(0)

    while True:
        # Grab the next frame; the success flag is not inspected
        _, frame = cam.read()

        # Preprocess, predict and decode in turn
        prepared = functions.preprocess(frame)
        logits = model.predict(prepared, batch_size=1)
        result = labeler.inverse_transform(logits)
# Check the environment: silence warnings, fix the NumPy seed and run a
# trivial federated computation as a smoke test.
warnings.simplefilter('ignore')
np.random.seed(0)
if six.PY3:
    tff.framework.set_default_executor(tff.framework.create_local_executor())
tff.federated_computation(
    lambda: 'The tensorflow federated environment is correctly setup!')()

# Load the data
emnist_train, emnist_test = load_data(path)

# Generate sample batch from the dataset of the client at index 1
example_dataset = emnist_train.create_tf_dataset_for_client(
    emnist_train.client_ids[1])
# The builtin next() drives the iterator protocol on both Python 2 and 3,
# unlike the Python-2-style ``.next()`` method.
example_element = next(iter(example_dataset))
preprocessed_example_dataset = preprocess(example_dataset, NUM_EPOCHS,
                                          SHUFFLE_BUFFER, BATCH_SIZE)
# Convert every tensor of the first preprocessed batch to a NumPy value
sample_batch = tf.nest.map_structure(
    lambda x: x.numpy(), next(iter(preprocessed_example_dataset)))

# Create federated data for each of the first NUM_CLIENTS clients
sample_clients = emnist_train.client_ids[0:NUM_CLIENTS]
federated_train_data = make_federated_data(emnist_train, sample_clients,
                                           NUM_EPOCHS, SHUFFLE_BUFFER,
                                           BATCH_SIZE)


# Function to create tff.learning instances
def model_fn():
    """Wrap a freshly compiled Keras model as a ``tff.learning`` model.

    ``sample_batch`` (module level) is passed along with the Keras model.
    """
    return tff.learning.from_compiled_keras_model(
        create_compiled_keras_model(), sample_batch)
示例#8
0
    def extract_patches(self, h5db, new_folder):
        """Extract tumor and normal patches (currently disabled).

        The method prints a notice and returns ``None`` immediately, so
        everything after the early return is unreachable. That code is kept
        as the reference implementation: for every annotation file of every
        centre it samples tumor and normal patches, stores them through
        ``self.store`` and saves location/mask images.

        Args:
            h5db: Database handle handed to ``self.store`` by the disabled
                code.
            new_folder: Directory in which the disabled code saves its PNG
                images.

        Returns:
            None.
        """
        # Function-call form prints the same text on Python 2 and parses on
        # Python 3, like every other print in this method.
        print('OpenSlide needed to extract patches.')
        return None
        # ------------------------------------------------------------------
        # Unreachable from here on (see the early return above).
        # ------------------------------------------------------------------
        '''
        for centre in self.centres:
            print('[cnn][patch_extraction] Selected Centre: ', centre)
            # each centre may have more than one annotation XML file, so here we retrieve
            # a list of all the XMLs related to the current centre
            annotation_list = np.sort(self.get_annotation_list(centre, self.xml_source_fld))
            # for each XML file in the annotation list
            # we want to extract tumor and normal patches
            for xml_file in annotation_list:
                files_counter +=1 # variable to shape the final data vector
        '''
        print('[debug] ', self.name)
        print('[debug] ', self.settings)

        self.set_files_counter(self.count_annotation_files())

        print('[dataset] {0} [extract_patches] {1} total annotation files.'.
              format(self.name, self.files_counter))

        for centre in self.centres:
            annotation_list = self.get_annotation_list(centre)
            for xml_file in annotation_list:
                slide_path = self.get_wsi_path(centre, xml_file)
                xml_path = os.path.join(self.xml_source_fld, xml_file)
                # retrieving the information about the file analysed.
                #   info is a dictionary with the following keys:
                #   info['centre'], current centre number
                #   info['patient'], current patient number
                #   info['node'], current WSI node
                info = self.get_info(xml_path, centre)
                #functions.setDBHierarchy(h5db, self.settings,info)
                if info['patient'] == '008_Mask.tif':
                    continue
                if xml_path is not None:  ## add check slide is open and ok
                    # preprocess takes the WSI path, and the slide_level and returns the
                    # the WSI openslide obj, the tumor annotation mask, the WSI image
                    # and the tumor contours

                    if self.name == 'camelyon16':
                        print('import openslides')
                        # NOTE(review): `slide` and `rgb_im` are used below but
                        # only assigned in the commented-out lines; this branch
                        # would raise NameError if the method were re-enabled.
                        #slide = openslide.OpenSlide(slide_path)
                        #rgb_im = np.array(slide.read_region((0,0),7,slide.level_dimensions[7]))
                        #mask_file = xml_path+'Tumor_{}_Mask.tif'.format(info['patient'])
                        #import pdb; pdb.set_trace()
                        annotations = np.asarray(
                            openslide.OpenSlide(xml_path).read_region(
                                (0, 0), 7, slide.level_dimensions[7]))
                        annotations_mask = annotations[:, :, 0]
                        #import pdb; pdb.set_trace()
                        im_contour = rgb_im

                    else:
                        # NOTE(review): leftover debugger breakpoint.
                        import pdb
                        pdb.set_trace()
                        slide, annotations_mask, rgb_im, im_contour = functions.preprocess(
                            slide_path,
                            xml_path,
                            slide_level=self.settings['slide_level'])

                    tum_patch_list, tum_patch_point = integral.patch_sampling_using_integral(
                        slide, self.settings['slide_level'], annotations_mask,
                        self.settings['patch_size'],
                        self.settings['n_samples'])
                    # conversion of the lists to np arrays
                    tum_patch_array = np.asarray(tum_patch_list)
                    #import pdb; pdb.set_trace()
                    tum_locations = np.array(tum_patch_point)
                    # storage in the HDF5 db
                    self.store(h5db, info, tum_patch_array, tum_locations,
                               'tumor')

                    # reverting the tumor mask to find normal tissue and extract patches
                    #    Note :
                    #    normal_mask = tissue mask (morp_im) - tumor mask (annotations_mask)

                    ##### restart from here ##

                    morp_im = functions.get_morp_im(rgb_im)
                    normal_im = morp_im - annotations_mask  ## np.min(normal_im) := -1.0
                    normal_im = normal_im == 1.0
                    normal_im = (normal_im).astype(int)
                    # sampling normal patches with uniform distribution
                    nor_patch_list, nor_patch_point = integral.patch_sampling_using_integral(
                        slide, self.settings['slide_level'], normal_im,
                        self.settings['patch_size'],
                        self.settings['n_samples'])
                    nor_patch_array = np.asarray(nor_patch_list)
                    normal_patches_locations = np.array(nor_patch_point)
                    # storing the normal patches and their locations
                    self.store(h5db, info, nor_patch_array, nor_patch_point,
                               'normal')
                    ''' Visualisation '''

                    # plotting the tumor locations in the XML file
                    # Drawing the normal patches sampling points
                    # tumor_locations.png shows the tumor patches locations in red
                    # and the normal patches locations in green
                    tumor_locations_im = rgb_im
                    plt.figure()
                    plt.imshow(tumor_locations_im)
                    for p_x, p_y in normal_patches_locations:
                        plt.scatter(p_y, p_x, c='g')
                        #cv2.circle(tumor_locations_im,(p_y,p_x),30,(0,255,0),10)
                    for p_x, p_y in tum_locations:
                        plt.scatter(p_y, p_x, c='r')
                        #cv2.circle(tumor_locations_im,(p_y,p_x),30,(255,0,0), 10)
                    print(
                        '[cnn][patch_extraction] Saving tumor locations image')
                    plt.savefig(
                        os.path.join(
                            new_folder,
                            'level{}_centre{}_patient{}_node{}_tumor_locations.png'
                            .format(self.settings['slide_level'],
                                    info['centre'], info['patient'],
                                    info['node'])))
                    plt.close()
                    #print('Saving tumor locations image')
                    #plt.savefig('tumor_locations_patient0{}_node{}'.format(info['patient'], info['node']))

                    print(
                        '[cnn][patch_extraction] Saving annotation mask and normal tissue mask'
                    )
                    plt.figure()
                    plt.imshow(annotations_mask)
                    plt.savefig(
                        os.path.join(
                            new_folder,
                            'level{}_centre{}_patient{}_node{}_annotation_mask.png'
                            .format(self.settings['slide_level'],
                                    info['centre'], info['patient'],
                                    info['node'])))
                    plt.close()

                    plt.figure()
                    plt.imshow(normal_im)
                    plt.savefig(
                        os.path.join(
                            new_folder,
                            'level{}_centre{}_patient{}_node{}_normal_tissue_mask.png'
                            .format(self.settings['slide_level'],
                                    info['centre'], info['patient'],
                                    info['node'])))
                    plt.close()
                    plt.close('all')

                    self.tum_counter += len(tum_patch_array)
                    self.nor_counter += len(nor_patch_array)
                    #self.nor_counter = 0
        return
示例#9
0
def analogy_trial(positive_set,
                  negative_set,
                  percent_test,
                  representation,
                  classifier,
                  extra={"sub_class": ""},
                  timer=1000000000,
                  comment=""):
    """Run one classification trial, log it and return its metrics.

    Positive and negative samples are read, labelled 'YES' / 'NO', split by
    ``functions.preprocess`` and classified by ``functions.classify``. The
    outcome (or a timeout marker) is passed to ``outputResults``.

    Args:
        positive_set: Source of positive examples for ``functions.get_list_re``.
        negative_set: Source of negative examples for ``functions.get_list_re``.
        percent_test: Forwarded to ``functions.preprocess``.
        representation: Forwarded to ``functions.classify``.
        classifier: Forwarded to ``functions.classify``.
        extra: Options handed to ``functions.set_extra``. NOTE(review): the
            default dict is shared between calls; that is only safe if
            ``set_extra`` does not mutate its argument (not visible here).
        timer: Forwarded to ``functions.classify``.
        comment: Free text stored in the last column of the logged row.

    Returns:
        ``[score, matrix, precision, recall, f_measure]`` with the matrix
        converted through ``.tolist()`` when it offers that method. After a
        classifier timeout the list holds five empty strings.
    """
    # Name of the calling function. It has to be read in this frame, so it
    # cannot be moved into a helper.
    caller = inspect.stack()[1][3]
    start = time.time()
    # Read in the sets of positive and negative examples
    analogy_list = functions.get_list_re(positive_set)
    non_analogy_list = functions.get_list_re(negative_set)
    samples = [(text, 'YES')
               for text in analogy_list] + [(text, 'NO')
                                            for text in non_analogy_list]
    extra = functions.set_extra(extra)
    beginTimer = time.time()
    currentTime = time.strftime("%c")

    # Divide the samples into a training set and a test set
    train_data, train_labels, test_data, test_labels = functions.preprocess(
        samples, percent_test, caller)
    # Make sure the classifier runs within a set time
    try:
        score, matrix, precision, recall, f_measure = functions.classify(
            train_data, train_labels, test_data, test_labels, classifier,
            representation, extra, timer)

    # catch the timeout error
    except timeout.TimeoutError:
        print("Classifier timeout.")
        print("Output error in log.")
        outputData = [
            currentTime, positive_set, negative_set, percent_test,
            representation, classifier, extra, "", "", "", "", "", "", "",
            "Algorithm Timeout"
        ]

    else:
        algoTime = time.time() - beginTimer
        runTime = time.time() - start
        outputData = [
            currentTime, positive_set, negative_set, percent_test,
            representation, classifier, extra, score, matrix, precision,
            recall, f_measure, runTime, algoTime, comment
        ]

    # Store results
    outputResults(outputData)
    if caller != "test_main_interface_output":
        print("Successfully logged trial results")
    # Keep only score, matrix, precision, recall and f_measure.
    outputData = outputData[7:-3]
    # After a timeout the matrix slot is an empty string, which has no
    # tolist(); convert only real matrices.
    if hasattr(outputData[1], "tolist"):
        outputData[1] = outputData[1].tolist()
    return outputData
示例#10
0
    def extract_patches(self):
        """
        Sample normal and tumor patches for every patient of every centre.

        For each patient the slide and annotation paths are resolved, a
        results directory is created and an H5 file is opened there in 'w'
        mode. ``preprocess`` yields the slide, the annotation (tumor) mask and
        the RGB image; the normal-tissue mask is derived from them and both
        masks are saved as images. Patches are then sampled in batches --
        normal tissue first, tumor second -- and each batch is stored through
        ``self.store_patient``. With the 'random' method a scatter image of
        the sampled locations is saved as well.

        Side effects: ``self.nor_counter`` and ``self.tum_counter`` grow by
        the number of patches stored, and ``self.report['errors']`` /
        ``self.report['warnings']`` are set at the end. Patients whose results
        directory or H5 file cannot be prepared are counted as errors and
        skipped.
        """
        errors = 0
        # NOTE(review): never incremented below, so the report always gets 0.
        warnings = 0
        settings = self.config['settings']
        for centre in self.centres:
            for patient in self.get_patients(centre):
                self.logger.info('processing patient: {}'.format(patient))

                slide_path = self.get_wsi_path(centre, patient)
                xml_path = self.get_annotation_path(centre, patient)
                info = self.get_info(centre, patient)

                pat_res_dir = self.make_patient_dir(info)
                if not pat_res_dir:
                    self.logger.error(
                        "patient {}: problems with results dir...".format(
                            patient))
                    errors += 1
                    continue

                h5db_path = os.path.join(pat_res_dir, self.h5db_bname + '.h5')
                try:
                    h5db = hd.File(h5db_path, 'w')
                except Exception as e:
                    self.logger.error(
                        "patient {}: can't open my H5 DB '{}': {} ".format(
                            patient, h5db_path, e))
                    errors += 1
                    continue

                # NOTE(review): h5db is only closed at the end of this
                # iteration; an exception raised in between leaves it open.
                slide, annotations_mask, rgb_im, im_contour = preprocess(
                    slide_path, xml_path, slide_level=settings['slide_level'])

                # reverting the tumor mask to find normal tissue and extract patches
                # Note :
                #    normal_mask = tissue mask (morp_im) - tumor mask (annotations_mask)
                morp_im = get_morp_im(rgb_im)
                normal_im = morp_im - annotations_mask  # np.min(normal_im) := -1.0
                normal_im = normal_im == 1.0
                normal_im = (normal_im).astype(int)

                # masks are the same for any sample batch ;-)
                # [TO-DO] make switchable from config/CL
                plt.figure()
                plt.imshow(annotations_mask)
                img_file = self.get_image_fname(pat_res_dir, 'annotation_mask',
                                                info)
                plt.savefig(img_file)
                plt.close()
                self.logger.info(
                    'patient {}: Annotation mask image saved to: {}'.format(
                        patient, img_file))

                plt.figure()
                plt.imshow(normal_im)
                img_file = self.get_image_fname(pat_res_dir,
                                                'normal_tissue_mask', info)
                plt.savefig(img_file)
                plt.close()
                self.logger.info(
                    'patient {}: Normal tissue mask image saved to: {}'.format(
                        patient, img_file))

                # Subset of the settings forwarded as keyword arguments to
                # integral.patch_sampling; a missing key raises KeyError here.
                opts = dict(
                    map(lambda k: (k, settings[k]), (
                        'area_overlap',
                        'bad_batch_size',
                        'gray_threshold',
                        'margin_width_x',
                        'margin_width_y',
                        'method',
                        'n_samples',
                        'patch_size',
                        'slide_level',
                        'white_level',
                        'white_threshold',
                        'white_threshold_incr',
                        'white_threshold_max',
                    )))

                # batch sample & store -- keep it small to avoid OOM!  In
                # "linear" sampling mode, more batches might be needed, so go
                # for a run and get the extracted patches and the last
                # index. Loop until no patches come out

                # [TO-DO] store info in _per-patient_ H5 DB

                # a patient case (:= slide) the tumor annotation mask is
                # usually (much) smaller than the normal tissue mask, thus a
                # different number of batches is needed to extract all the
                # tumor and normal patches. So we compute the normal tissue
                # mask once. Apart from that, there's no relation between
                # tumor and normal patches, hence we batch-loop two times: a
                # first time for the tumor case and a second time for the
                # normal case. N.B. In 'random' sampling mode, just one batch
                # is ever done.

                index = 0  # ignored in 'random' mode -- only one batch done
                tum_patch_point = []
                # Batch counters and last sampled indices, per tissue kind
                # (_t = tumor, _n = normal).
                bcnt_t, bcnt_n = 0, 0
                last_idx_t = last_idx_n = -1

                if settings['window']:
                    self.logger.info(
                        "patient {}: restricting nonzero points range to {}%, {}%"
                        .format(patient, settings['window'][0],
                                settings['window'][1]))
                nzx_n, nzy_n = integral.nonzero_range(normal_im,
                                                      settings['window'])

                # *** Warning! *** Split loops doesn't work if we want to show
                # images: there's data dependency on "normal_patches_locations".

                # normal tissue
                while (True):
                    self.logger.info(
                        "patient {}: >>> [normal] starting batch {}".format(
                            patient, bcnt_n))

                    # Resume right after the last index of the previous batch.
                    opts['start_idx'] = last_idx_n + 1
                    nor_patch_list, nor_patch_point, last_idx_n = integral.patch_sampling(
                        slide, normal_im, nzx_n, nzy_n, **opts)
                    if nor_patch_point and nor_patch_list:
                        nor_patch_array = np.asarray(nor_patch_list)
                        normal_patches_locations = np.array(nor_patch_point)
                        self.store_patient(info, nor_patch_array,
                                           nor_patch_point, 'normal', h5db,
                                           bcnt_n)
                    else:
                        self.logger.info(
                            'patient {}: batch {}: no (more) normal patches'.
                            format(patient, bcnt_n))
                        break

                    self.nor_counter += len(nor_patch_array)

                    self.logger.info(
                        "patient {}: <<< [normal] done batch {}".format(
                            patient, bcnt_n))

                    if last_idx_n == None:
                        # in 'random' method, this tells us that we're done sampling
                        break

                    bcnt_n += 1
                # {end-while}

                # TO-DO: batch runs should be better encapsulated (aux fun/method)...

                # tumors masks are usually too small for windowed sampling, so
                # take the full range
                nzx_t, nzy_t = integral.nonzero_range(annotations_mask, [])
                while (True):

                    self.logger.info(
                        "patient {}: >>> [tumor] starting batch {}".format(
                            patient, bcnt_t))

                    opts['start_idx'] = last_idx_t + 1
                    tum_patch_list, tum_patch_point, last_idx_t = integral.patch_sampling(
                        slide, annotations_mask, nzx_t, nzy_t, **opts)
                    if tum_patch_list and tum_patch_point:
                        tum_patch_array = np.asarray(tum_patch_list)
                        tum_locations = np.array(tum_patch_point)
                        self.store_patient(info, tum_patch_array,
                                           tum_locations, 'tumor', h5db,
                                           bcnt_t)
                    else:
                        self.logger.info(
                            'patient {}: batch {}: no (more) tumor patches'.
                            format(patient, bcnt_t))
                        break

                    if opts['method'] == 'random':
                        if bcnt_n != bcnt_t:
                            self.logger.error(
                                "[BUG] Can't make scatter image(s): batch count mismatch"
                            )
                            errors += 1
                        else:
                            # plotting the tumor locations in the XML file Drawing the
                            # normal patches sampling points tumor_locations.png shows the
                            # tumor patches locations in red and the normal patches
                            # locations in green
                            tumor_locations_im = rgb_im
                            plt.figure()
                            plt.imshow(tumor_locations_im)
                            # Warning! Data dependency on previous normal batch run
                            # NOTE(review): normal_patches_locations is unbound
                            # for the first patient (or stale from an earlier
                            # one) when the normal loop produced no batch.
                            for p_x, p_y in normal_patches_locations:
                                plt.scatter(p_y, p_x, c='g')
                            for p_x, p_y in tum_locations:
                                plt.scatter(p_y, p_x, c='r')

                            img_file = self.get_image_fname(
                                pat_res_dir, 'tumor_locations', info, bcnt_t)
                            plt.savefig(img_file)
                            plt.close()
                            self.logger.info(
                                'patient {}: batch {}: tumor locations image saved to: {}'
                                .format(patient, bcnt_t, img_file))

                    self.tum_counter += len(tum_patch_array)

                    self.logger.info(
                        "patient {}: <<< [tumor] done batch {}".format(
                            patient, bcnt_t))

                    if last_idx_t == None:
                        # in 'random' method, this tells us that we're done sampling
                        break

                    bcnt_t += 1
                # {end-while}

                h5db.close()
                self.logger.info(
                    "patient {}: processed in {} (normal) + {} (tumor) batches"
                    .format(patient, bcnt_n, bcnt_t))
                self.logger.info("patient {}: data saved to H5 DB: {}".format(
                    patient, h5db_path))
            # {end-for-patient}
        # {end-for-centre}

        self.report['errors'] = errors
        self.report['warnings'] = warnings
示例#11
0
        else:
            logging.basicConfig(level=logging.INFO)
    logging.info(
        f"using {m}.{e} model to calculate submitid {i}") if v else None

    # load word embedding model
    start = datetime.now()
    vectors = load_model(m, e)
    logging.info(f"model loaded in {datetime.now() - start}") if v else None

    # get source code and problem text from database that corresponds with input submit ID
    code, problem = get(i)

    # preprocessing includes normalization and tokenization
    logging.info("preprocessing code and problem text...") if v else None
    problem_processed, comments_processed, code_only = preprocess(
        problem, code)
    # count words in code comment
    comment_word_count_raw = 0
    for line in comments_processed:
        comment_word_count_raw += len(line)
    logging.info("preprocessing finished") if v else None

    # calculate code density
    logging.info("calculating code density...") if v else None
    comment_line_density, comment_char_density = calculate_density(
        comments_processed, code_only)
    logging.info("finished calculating") if v else None

    # calculate code header score
    logging.info("calculating header score...") if v else None
    header_score = calculate_header_score(comments_processed)
示例#12
0
def main():
    """Print the test samples that two semi-supervised models both get wrong.

    Samples are labelled 1 (positive file) or 0 (negative file) and split by
    ``functions.preprocess`` (second argument 0.15). A seeded random mask
    replaces about 70% of the training labels with -1. For each
    (estimator, representation) pair in ``overlap_input`` the data is
    vectorised, the estimator is grid-searched and its confusion matrix on
    the test set is printed. Finally the test items misclassified
    identically by both models are listed, grouped by true label.
    """
    positive_set = '../latest_analogy/test_extractions/bc_samples.txt'
    negative_set = '../latest_analogy/test_extractions/bc_grounds.txt'
    # Alternative inputs used previously:
    #   'test_extractions/test-neural-hash-samples.txt'
    #   'test_extractions/test-neural-hash-ground.txt'
    analogy_list = functions.get_list_re(positive_set)
    non_analogy_list = functions.get_list_re(negative_set)
    positives = [(text, 1) for text in analogy_list]
    negatives = [(text, 0) for text in non_analogy_list]
    samples = positives + negatives
    train_data, train_labels, test_data, test_labels = functions.preprocess(
        samples, 0.15)
    overlap_input = [('LP', 'count'), ('TSVM', 'tfidf')]

    # Hide the labels of a random subset of the training data (-1 marks
    # "unlabeled").
    rng = np.random.RandomState(42)
    random_unlabeled_points = rng.rand(len(train_labels)) < 0.7
    train_labels = np.array(train_labels)
    test_labels = np.array(test_labels)
    train_labels[random_unlabeled_points] = -1
    train_data = np.array(train_data)

    second_prediction = []
    no_as_yes = []  # predictions with label NO classified with label YES
    yes_as_no = []  # predictions with label YES classified with label NO

    for position, element in enumerate(overlap_input):
        estimator = helpers.get_function(element[0])
        representation = helpers.get_function(element[1])
        parameters = helpers.get_parameters(element[0])
        train_set = representation.fit_transform(train_data).toarray()
        test_set = representation.transform(test_data).toarray()
        grid_search = GridSearchCV(estimator,
                                   parameters,
                                   n_jobs=-1,
                                   verbose=10,
                                   error_score=-1)
        grid_search.fit(train_set, train_labels)
        predicted = grid_search.best_estimator_.predict(test_set)
        if position == 0:
            first_prediction = predicted
        else:
            second_prediction = predicted
        print(confusion_matrix(test_labels, predicted, labels=[1, 0]))

    for idx, truth in enumerate(test_labels):
        shared_mistake = (truth != first_prediction[idx]) and (
            first_prediction[idx] == second_prediction[idx])
        if not shared_mistake:
            continue
        if truth == 0:
            no_as_yes.append(test_data[idx])
        else:
            yes_as_no.append(test_data[idx])

    print("Overlapping NO as YES:")
    print("Number: ", len(no_as_yes))
    for item in no_as_yes:
        print(item)
    print("Overlapping YES as NO:")
    print("Number: ", len(yes_as_no))
    for item in yes_as_no:
        print(item)
from scikitTSVM import SKTSVM
import warnings
# Silence (pending) deprecation warnings for the rest of the script.
warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Estimator instance; not used within the visible part of this script.
tsvm = SKTSVM(probability=False, C=0.01, gamma=1.0, kernel='linear', lamU=1.0)
percent_test = 0.15
positive_set = 'data/bc_samples.txt'
negative_set = 'data/bc_grounds.txt'
unlabeled_set = 'data/unlabeled-data.csv'
analogy_list = functions.get_list_re(positive_set)
non_analogy_list = functions.get_list_re(negative_set)
unlabeled_list = functions.get_list_re(unlabeled_set)
# Positive samples get label 1, negative samples label 0.
samples = [(text, 1) for text in analogy_list] + [(text, 0)
                                                  for text in non_analogy_list]
train_data, train_labels, test_data, test_labels = functions.preprocess(
    samples, percent_test)
# Append unlabeled samples (label -1) to the training split. The `<=` bound
# admits indices 0..20000, i.e. up to 20001 samples.
# NOTE(review): confirm whether 20000 or 20001 was intended.
j = 0
for sample in unlabeled_list:
    if j <= 20000:
        train_data.append(sample)
        train_labels.append(-1)
    j += 1
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)
train_data = np.array(train_data)
# The identity tokenizer passes each document through unchanged, so the
# documents are presumably already token sequences -- verify against
# functions.preprocess.
TfidfVect = TfidfVectorizer(tokenizer=lambda doc: doc, lowercase=False)
train_set = TfidfVect.fit_transform(train_data).toarray()
test_set = TfidfVect.transform(test_data).toarray()

# Label Propagation
"""
示例#14
0
# Three ways to build the songs dataframe are included: from SQLite, from a
# local path, and from GitHub. Only the GitHub option is active.

# Sqlite option
# # songs_df =  pd.read_sql_table('songs', 'sqlite:///db.sqlite3')

# path option
# songs_df = pd.read_csv('../Data/SpotifyAudioFeaturesApril2019_duplicates_removed.csv')

# github option
infile = "https://raw.githubusercontent.com/spotify-recommendation-engine-3/data_science/master/Data/SpotifyAudioFeaturesApril2019_duplicates_removed.csv"
songs_df = pd.read_csv(infile)

# The first three columns go to `y`, all remaining columns to `X`.
y = songs_df[songs_df.columns[:3]]
X = songs_df[songs_df.columns[3:]]

# Model built once at import time from the preprocessed `X` columns.
my_model = create_model(preprocess(X))


@app.route('/', methods=['GET', 'POST'])
def plot_png():
    """Serve the figure built by ``create_figure`` as a PNG response."""
    figure = create_figure()
    png_buffer = io.BytesIO()
    canvas = FigureCanvas(figure)
    canvas.print_png(png_buffer)
    return Response(png_buffer.getvalue(), mimetype='image/png')


def create_figure():
    song_df = songs_df.sample()
    song_df = song_df.iloc[:, 3:]
    songs_to_plot = suggest_songs(song_df, songs_df, y, my_model)
    fig = Figure(figsize=(9, 9), edgecolor='gray')