Example #1
def classify():
    """Classify the texts in the POST body.

    Takes POST data structured like this:
    {
      "data": ["I like them apples", "I prefer green apples"]
    }
    """
    classifiers_to_use = request.args.get('classifiers')
    use_new_classifier = request.args.get('new_cls')
    
    # The two query parameters are optional and independent, so use two ifs;
    # with an elif, new_cls would stay an unparsed (always truthy) string
    # whenever classifiers is also given.
    if classifiers_to_use:
        classifiers_to_use = classifiers_to_use.split(',')
    if use_new_classifier:
        use_new_classifier = (use_new_classifier.lower() == 'true')
    
    post_payload = request.get_json(force=True)
    cl = Classification()
    data = post_payload['data']
    predictions = list(cl.single_classification(
        tuple(data),
        to_json=True,
        classifiers_to_include=classifiers_to_use,
        use_new_classifier=use_new_classifier))

    for text, preds in zip(data, predictions):
        preds['text'] = text

    response = {'count': len(predictions),
                'data': predictions}
    return jsonify(response)
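A minimal client-side sketch for exercising this endpoint, assuming the function is registered on a Flask route such as /classify (the decorator is not shown in the snippet) and the server runs locally; the classifier names are illustrative:

import requests

# Hypothetical route and host; the snippet does not show the @app.route registration.
resp = requests.post(
    'http://localhost:5000/classify',
    params={'classifiers': 'nb,svm', 'new_cls': 'true'},
    json={'data': ['I like them apples', 'I prefer green apples']},
)
print(resp.json())  # {'count': 2, 'data': [...]}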
Example #2
    def __init__(self, environment, project):
        Classification.__init__(self, environment)

        self.project = project

        self.num_retired = None
        self.non_blanks_retired = None

        self.to_retire = None
Example #3
    def __init__(self, environment, project):
        Classification.__init__(self, environment)

        self.project = project

        self.num_retired = None
        self.non_blanks_retired = None
        # to know how often we should call Panoptes to get a new token
        # save on having to make unnecessary calls
        self.token_date = datetime.datetime.now()
        self.to_retire = set()

        self.total_retired = 0
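The token_date field suggests the retirement code only refreshes its Panoptes token once the cached one has aged, to avoid unnecessary API calls. A hedged sketch of what such a check could look like (the helper name, the refresh call, and the 30-minute window are assumptions, not part of the snippet):

    def __fresh_token(self):
        # Hypothetical helper: refresh only when the cached token is stale.
        if datetime.datetime.now() - self.token_date > datetime.timedelta(minutes=30):
            self.token = self.__request_new_token()  # assumed Panoptes call
            self.token_date = datetime.datetime.now()
        return self.token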
Example #4
    def __init__(self, environment, param_dict):
        Classification.__init__(self, environment)
        assert isinstance(param_dict, dict)

        # to retire subjects, we need a connection to the host api, which hopefully is provided
        self.host_api = None
        self.project_id = None
        self.token = None
        self.workflow_id = None
        for key, value in param_dict.items():
            if key == "host":
                self.host_api = value
            elif key == "project_id":
                self.project_id = value
            elif key == "token":
                self.token = value
            elif key == "workflow_id":
                self.workflow_id = value

        assert (self.host_api is not None) and (self.project_id is not None) \
            and (self.token is not None) and (self.workflow_id is not None)
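The key-matching loop above can be collapsed with dict.get; a sketch of an equivalent initialization (identical behavior for the four recognized keys, and unrecognized keys are ignored in both versions):

        self.host_api = param_dict.get("host")
        self.project_id = param_dict.get("project_id")
        self.token = param_dict.get("token")
        self.workflow_id = param_dict.get("workflow_id")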
Example #5
    def test_cost_zeros(self):
        # Cost should be 0.693 when theta is zeroed
        theta_z = pd.Series(np.zeros(len(self.X.columns)))
        clsfy = Classification(self.X, self.y)
        cost = clsfy._cost(theta_z)
        self.assertAlmostEqual(cost, 0.693, places=3)
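The expected value follows directly from the logistic cost: with theta = 0 every prediction is sigmoid(0) = 0.5, so the cross-entropy cost is -ln(0.5) = ln 2 ≈ 0.693 regardless of the labels. A minimal _cost sketch consistent with this test (the class internals are an assumption; np and pd are numpy and pandas):

    def _cost(self, theta):
        # J = -mean(y*log(h) + (1-y)*log(1-h)) with h = sigmoid(X @ theta)
        h = 1.0 / (1.0 + np.exp(-self.X.values @ theta.values))
        y = self.y.values
        return -np.mean(y * np.log(h) + (1 - y) * np.log(1 - h))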
Example #6
cols = ""  # must be initialized before the loop condition reads it
cols_list = []
# Read column names to scale until the user enters "Y";
# the terminating "Y" is dropped below via cols_list[:-1].
while cols != "Y":
    cols = input()
    cols_list.append(cols)
data.scale_data(cols_list[:-1])

cols_list = data.get_cols()
y_column_name = input("enter y column name: ")
X_train, X_test, y_train, y_test = data.spilt_data(y_column_name)

model_type = input("Enter R for Regression and C for Classification: ")

if model_type == "C":
    print("Your options are: " + str(Classifier_list))  #add mode list
    modelname = input("Enter model to be used: ")
    classifier = Classification(X_train, X_test, y_train, y_test, modelname)
    classifier.predict()
    classifier.accuracy()
    classifier.save_model()

elif model_type == 'R':
    print("Your options are: " + str(Regressor_list))  #add mode list
    modelname = input("Enter model to be used, use A for all")
    if modelname == "A":
        for modelname in Regressor_list:
            regressor = Regression(X_train, X_test, y_train, y_test, modelname)
            regressor.predict()
            regressor.accuracy()
            regressor.save_model()
    else:
        regressor = Regression(X_train, X_test, y_train, y_test, modelname)
        # mirror the per-model steps from the "A" branch above
        regressor.predict()
        regressor.accuracy()
        regressor.save_model()
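An interactive session with this script might look like the following (all values are illustrative):

    age        <- columns to scale, one per line
    income
    Y          <- terminates the loop; dropped again via cols_list[:-1]
    enter y column name: purchased
    Enter R for Regression and C for Classification: C
    Your options are: ['logistic', 'knn', 'svm']
    Enter model to be used: svm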
Example #7
class Wales(WaterAlgorithm):
    
    def __init__(self, O_O_distance=2.8, O_H_distance=1.0, intermediate_saves=[], folder="wales_21+", group_saves=[]):
        # initialize the base class on this instance rather than constructing
        # and discarding a throwaway WaterAlgorithm object
        WaterAlgorithm.__init__(self)
        self.initialize(O_H_distance=O_H_distance, O_O_distance=O_O_distance,
                        intermediate_saves=intermediate_saves, group_saves=group_saves,
                        folder=folder, charge=1, do_symmetry_check=False,
                        order=[3, 14, 9, 2, 10, 19, 16, 15, 7, 18, 4, 0,
                               17, 1, 12, 6, 5, 8, 20, 11, 13])
        self.N = 21
        self.classification = Classification(self) 

    def get_single_molecule_hydrogen_coordinates(self, site, water_orientation, i, oxygen_positions,
                                                 nearest_neighbors_nos, nn_periodicity,
                                                 nn_periodicity_axis, cell):
        bvv = get_bond_variable_values_from_water_orientation(water_orientation)
        
        if water_orientation > 9:
            result = np.zeros((3,3))
        else:
            result = np.zeros((2,3))
        index = 0
        # print(nearest_neighbors_nos)
        for n, x in enumerate(nearest_neighbors_nos):
            if bvv[n] == 1:
                # i == x means that this is a dangling bond
                if i == x:
                    com = oxygen_positions[3]
                    vector = oxygen_positions[i] - com
                    # normalize the vector  
                    vector_length = scipy.linalg.norm(vector)
                    vector /= vector_length
                    # the dangling hydrogen is along this vector
                    result[index] = np.array(oxygen_positions[i] + self.O_H_distance * vector)
                else:
                    # place the hydrogen along the O-O axis, O_H_distance from this oxygen
                    oo_vector = oxygen_positions[i] - oxygen_positions[x]
                    oo_length = get_distance(oxygen_positions[i], oxygen_positions[x], False, None)
                    result[index] = oxygen_positions[i] - (self.O_H_distance * oo_vector) / oo_length
                index += 1
        # print(result)
        return result

    def get_all_oxygen_coordinates(self):
        """result = np.array(
       [[  0.000, 0.000,  0.000 ],
        [  0.427, -0.000,  0.565 ], 
        [  0.188, 0.577,  0.795 ], 
        [ -0.491, 0.357,  0.795 ],
        [ -0.491,-0.357,  0.795 ],
        [  0.188,-0.577,  0.795 ],
        [  0.982, 0.000,  0.188 ],
        [  0.304, 0.934,  0.188 ],
        [ -0.795, 0.577,  0.188 ],
        [ -0.795,-0.577,  0.188 ],
        [  0.304,-0.934,  0.188 ],
        [  0.645, 0.447, -0.118 ],
        [ -0.304, 0.934, -0.188 ],
        [ -0.702, 0.000, -0.158 ],
        [ -0.304,-0.934, -0.188 ],
        [  0.795,-0.577, -0.188 ],
        [  0.491, 0.357, -0.795 ],
        [ -0.188, 0.577, -0.795 ],
        [ -0.607, 0.000, -0.795 ],
        [ -0.188,-0.577, -0.795 ],
        [  0.491,-0.357, -0.795 ]]) * (self.O_O_distance / 0.713644)
        return result"""
        return read('optimal_wales.xyz').get_positions()

    def additional_requirements_met(self, water_orientation, water_orient, molecule_no):
        wo = water_orient.copy()
        if wo[molecule_no] != -1:
            return False
        wo[molecule_no] = water_orientation
        res, counts = self.classification.get_bond_types(wo)
        if counts[10][1] > 0:
            print(water_orient)
            print(counts)
            input()
        # Check the number of AAD-AAD and ADD-ADD bonds
        if (counts[4][0] + counts[8][0] > 2 or counts[3][0] > 1
                or counts[10][1] > 0 or counts[11][1] > 0
                or counts[12][1] > 0 or counts[13][1] > 0):
            # print("----------------")
            # print(counts[4][0] + counts[8][0])
            # self.view_result(wo)
            # input()
            return False
        else:
            return True
Example #8
    def test_sigmoid_scalar(self):
        # For very negative inputs sigmoid should approach 0; for very large ones, 1
        self.assertAlmostEqual(Classification._sigmoid(-1000000), 0)
        self.assertAlmostEqual(Classification._sigmoid(1000000), 1)
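A sketch of a _sigmoid that passes this test, written in a numerically stable form so np.exp is never called on a large positive number (the real class internals are an assumption):

import numpy as np

class Classification:
    @staticmethod
    def _sigmoid(z):
        # For z >= 0, exp(-z) <= 1 and cannot overflow; for z < 0 use the
        # algebraically equivalent exp(z) / (1 + exp(z)).
        if z >= 0:
            return 1.0 / (1.0 + np.exp(-z))
        ez = np.exp(z)
        return ez / (1.0 + ez)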
Example #9
        "map_code",
        "ambito",
        "population_total",
        "population_male",
        "population_female",
        "dwellings_occupied",
    ]

    df = df[["state_code", "state_name", "municipality_code", "municipality_name", "locality_code", "locality_name"]]

    df.state_code = df.state_code.astype(str).str.zfill(2)
    df.municipality_code = df.municipality_code.astype(str).str.zfill(3)
    df.locality_code = df.locality_code.astype(str).str.zfill(4)

    df.municipality_code = df.state_code + df.municipality_code
    df.locality_code = df.municipality_code + df.locality_code

    df.state_name = df.state_name.str.title()
    df.municipality_name = df.municipality_name.str.title()
    df.locality_name = df.locality_name.str.title()

    h = Hierarchy(["state", "municipality", "locality"])

    parent_code_table = repeated_table_to_parent_id_table(df, h)
    parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)

    c = Classification(parent_id_table, h)

    c.to_csv("out/locations_mexico_inegi.csv")
    c.to_stata("out/locations_mexico_inegi.dta")
Example #10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch_size = 64

# image size 3, 32, 32
# batch size must be an even number
# shuffle must be True
cifar_10_train_dt = CIFAR10(r'data', download=False, transform=ToTensor())
#dev = Subset(cifar_10_train_dt, range(128))
cifar_10_train_l = DataLoader(cifar_10_train_dt,
                              batch_size=batch_size,
                              shuffle=False,
                              pin_memory=torch.cuda.is_available())

encoder = models.Encoder()
classification = Classification().to(device)

root = Path(r'modified/models')
model_path = root / Path(r'encoder500.wgt')
encoder.load_state_dict(torch.load(str(model_path)))
encoder.to(device)
classification_optim = Adam(classification.parameters(), lr=1e-4)

epoch_restart = 50
root_classification_model = Path(r'classification_model_baseline_modified')

if epoch_restart > 0 and root is not None:
    classification_loss_file = root_classification_model / Path(
        'classification_loss' + str(epoch_restart) + '.wgt')
    classification.load_state_dict(torch.load(str(classification_loss_file)))
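A hedged sketch of how the frozen encoder and the classification head might be combined for one optimization step (the loss choice and the encoder's output shape are assumptions, not shown in the snippet):

import torch.nn.functional as F

for images, targets in cifar_10_train_l:
    images, targets = images.to(device), targets.to(device)
    with torch.no_grad():
        features = encoder(images)  # encoder stays frozen
    logits = classification(features)
    loss = F.cross_entropy(logits, targets)
    classification_optim.zero_grad()
    loss.backward()
    classification_optim.step()
    break  # single illustrative step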
Example #11
def trainNestedCV(direct, subject, session, filename, hyp_params, parameters):

    subj = load_subject(direct, subject, 1, filename)["subject"]
    #
    # data = subj.data3D.astype(np.float32) # convert data to 3d for deep learning
    # labels = subj.labels.astype(np.int64)
    # labels[:] = [x - 1 for x in labels]
    data, labels = format_data('words', subject, 4096)

    import random  #just for testing
    labels = []  #just for testing
    for i in range(200):  #just for testing
        labels.append(random.randint(0, 3))  #just for testing

    labels = np.array(labels).astype(np.int64)
    data = data[:200, :, 0:750]

    unique = np.unique(labels, return_counts=False)
    data_params = dict(n_classes=len(unique),
                       n_chans=6,
                       input_time_length=subj.epoch)  #n_chans = subj.n_chans

    #w = windows(data, subj, 500, 250, 500)  # fs = subj.sfreq # list of windows

    num_folds = 2
    # don't randomize trials, to preserve their structure; random_state is
    # omitted because it has no effect when shuffle=False
    skf = StratifiedKFold(n_splits=num_folds, shuffle=False)

    trainsetlist, testsetlist = [], []
    inner_fold_acc, inner_fold_loss, inner_fold_CE = [], [], []

    subj_results = Results(
        subject, filename,
        num_folds)  #, class_names=["apple", "orange", "car", "bus"]
    subj_results.change_directory(direct)

    subj_results.get_acc_loss_df(
        hyp_params, 'Fold')  # empty dataframe headed with each HP set

    clf = Classification(hyp_params, parameters, data_params, "01", "shallow",
                         "words")  # classifier object

    print(f"Inner-fold training for Subject {subject} in progress...")
    for inner_ind, outer_index in skf.split(data, labels):
        inner_fold, outer_fold = data[inner_ind], data[outer_index]
        inner_labels, outer_labels = labels[inner_ind], labels[outer_index]
        subj_results.concat_y_true(outer_labels)

        trainsetlist.append(SignalAndTarget(
            inner_fold, inner_labels))  # used for outer-fold train/test
        testsetlist.append(SignalAndTarget(outer_fold, outer_labels))

        for train_idx, valid_idx in skf.split(inner_fold, inner_labels):
            X_Train, X_val = inner_fold[train_idx], inner_fold[valid_idx]
            y_train, y_val = inner_labels[train_idx], inner_labels[valid_idx]
            train_set = SignalAndTarget(X_Train, y_train)
            val_set = SignalAndTarget(X_val, y_val)

            hyp_param_acc, hyp_param_loss, hyp_param_CE = clf.train_inner(
                train_set, val_set, None, False)

            inner_fold_loss.append(hyp_param_loss)
            inner_fold_acc.append(hyp_param_acc)
            inner_fold_CE.append(hyp_param_CE)

    subj_results.fill_acc_loss_df(inner_fold_acc, inner_fold_loss,
                                  inner_fold_CE)

    subj_results.get_hp_means(
        hyp_params, "accuracy")  #needed to select inter-subject parameters

    subj_results.get_best_params("accuracy")
    clf.best_params = subj_results.best_params
    clf.set_best_params()
    print(f"Best parameters selected: {clf.best_params}")
    print(
        "///////-------------------------------------------------------///////"
    )
    print(
        f"Outer-fold training and testing for Subject {subject} in progress..."
    )
    scores, fold_models, predictions, probabilities, outer_cross_entropy = clf.train_outer(
        trainsetlist, testsetlist, False
    )  #accuracy score for each fold, combined predictions for each fold

    subj_results.outer_fold_accuracies = scores
    subj_results.y_pred = np.array(predictions)
    subj_results.y_probs = np.array(probabilities)
    subj_results.outer_fold_cross_entropies = outer_cross_entropy

    (subj_results.train_loss, subj_results.valid_loss, subj_results.test_loss,
     subj_results.train_acc, subj_results.valid_acc,
     subj_results.test_acc) = get_model_loss_and_acc(fold_models)

    subj_results.save_result()

    subj_results.subject_stats()
    print("")
    print(subj_results.subject_stats_df.head())
Example #12
    four_digit["parent_code"] = four_digit.code.apply(lambda x: x[:2])
    four_digit = four_digit.drop("community", axis=1)
    four_digit["level"] = "4digit"

    two_digit = hs4.iloc[1241:1339]
    two_digit["code"] = two_digit.code.astype(str).str.zfill(2)
    two_digit = two_digit.rename(columns={"community": "parent_code"})
    two_digit["parent_code"] = two_digit.parent_code.astype(str).str.zfill(3)
    two_digit["level"] = "2digit"

    section = hs4.iloc[1339:].drop("community", axis=1)
    section["code"] = section.code.astype(str).str.zfill(3)
    section["parent_code"] = None
    section["level"] = "section"

    hs_clean = pd.concat([section, two_digit, four_digit])
    hs_clean = hs_clean.reset_index(drop=True)

    h = Hierarchy(["section", "2digit", "4digit"])
    hs_clean = parent_code_table_to_parent_id_table(hs_clean, h)
    c = Classification(hs_clean, h)

    #community = pd.read_table("in/hs4_community.tsv", encoding="utf-8")
    #hs4 = hs4.merge(community, left_on="community", right_on="code", how="inner")

    # weird bug where pandas infer_type was returning mixed instead of string
    c.table.code = c.table.code.astype(str)

    c.to_csv("out/hs92_atlas.csv")
    c.to_stata("out/hs92_atlas.dta")
Example #13
from classification import Classification
from pprint import pprint


client_id = 'id'
client_secret = 'secret'

c = Classification(client_id, client_secret)

pprint(c.find_student_classification('MI-PYT', 'laskobor'))
Example #14
    # order levels by hierarchy; modern pandas takes ordered categories via CategoricalDtype
    df.level = df.level.astype(pd.CategoricalDtype(categories=h, ordered=True))

    df = df.sort_values(by=["level", "code"])

    df.level = df.level.astype(str)
    df = df.reset_index(drop=True)
    parent_id_table = parent_code_table_to_parent_id_table(df, h)

    # TODO: This isn't the official classification level name but this makes
    # compatibility between colombia and mexico way easier
    # parent_code_table.loc[parent_code_table.level == "state", "level"] = "department"

    # Drop the "locality" level since we don't use it
    # parent_code_table = parent_code_table[parent_code_table.level != "locality"]

    parent_id_table = parent_id_table[[
        "code",
        "name",
        "level",
        "name_es",
        "name_en",
        "name_short_es",
        "name_short_en",
        "parent_id",
    ]]

    c = Classification(parent_id_table, h)

    c.to_csv("out/locations_peru_inei.csv")
    c.to_stata("out/locations_peru_inei.dta")
Example #15
    def __init__(
            self,
            method=None,
            data_fold=None,
            full_dataset=True,
            metal_groupThres=0.1,
            thatch_groupThres=0.1,
            groupBounds=False,
            erosion=0,
            suppress=None,
            pickle_viola=None,  # single_detector=True, 
            in_path=None,
            out_path=None,
            neural=None,
            ensemble=None,
            detector_params=None,
            pipe=None,
            out_folder_name=None,
            net_threshold=0.5):
        '''
        Parameters:
        ------------------
        groupThres bool
            Decides if we should do grouping on neural detections
        method: string
            Can be either 'viola' or 'slide'
        '''
        assert method in ('viola', 'slide')

        self.method = method
        self.full_dataset = full_dataset
        self.data_fold = data_fold

        self.groupThres = dict()
        self.groupThres['metal'] = float(metal_groupThres)
        self.groupThres['thatch'] = float(thatch_groupThres)
        self.groupBounds = groupBounds
        self.erosion = erosion

        #self.single_detector = single_detector
        self.in_path = in_path
        if DEBUG:
            self.img_names = [
                img_name for img_name in os.listdir(self.in_path)
                if img_name.endswith('jpg')
            ][:1]
        else:
            self.img_names = [
                img_name for img_name in os.listdir(self.in_path)
                if img_name.endswith('jpg')
            ]
        self.out_path = out_path

        #Setup Viola: if we are given an evaluation directly, don't bother running viola
        if self.method == 'viola':
            self.pickle_viola = pickle_viola
            if self.pickle_viola is None:
                self.viola = ViolaDetector(pipeline=True,
                                           out_path=out_path,
                                           in_path=in_path,
                                           folder_name=out_folder_name,
                                           save_imgs=True,
                                           **detector_params)
            else:
                with open(pickle_viola, 'rb') as f:
                    self.viola_evaluation = pickle.load(f)
                    self.viola_evaluation.in_path = self.in_path
                    self.viola_evaluation.out_path = self.out_path

        #Setup the sliding window
        elif self.method == 'slide':
            self.slider = SlidingWindowNeural(full_dataset=self.full_dataset,
                                              in_path=self.in_path,
                                              out_path=self.out_path,
                                              **detector_params)
        else:
            raise ValueError('Need to specify either viola or sliding window')

        self.ensemble = ensemble

        #EVALUATION OBJECTS
        self.auc_thresholds = [.5]
        self.detections_after_neural = list()
        self.evaluation_after_neural = list()
        detector_names = detector_params[
            'detector_names'] if self.method == 'viola' else None
        for thres in self.auc_thresholds:
            detections = Detections()
            self.detections_after_neural.append(detections)
            self.evaluation_after_neural.append(
                Evaluation(detections=detections,
                           method='pipeline',
                           save_imgs=True,
                           out_path=self.out_path,
                           auc_threshold=thres,
                           folder_name=out_folder_name,
                           in_path=self.in_path,
                           detector_names=detector_names))

        self.auc = AucCurve(self.img_names,
                            self.evaluation_after_neural[0].correct_roofs,
                            self.out_path, self.method)
        if self.data_fold == utils.TESTING:
            self.classification = Classification(
                self.img_names, self.out_path,
                self.evaluation_after_neural[0].correct_roofs, self.method)
        print(self.img_names)

        self.neural_time = defaultdict(int)
        self.viola_time = defaultdict(int)
Example #16
class Pipeline(object):
    def __init__(
            self,
            method=None,
            data_fold=None,
            full_dataset=True,
            metal_groupThres=0.1,
            thatch_groupThres=0.1,
            groupBounds=False,
            erosion=0,
            suppress=None,
            pickle_viola=None,  # single_detector=True, 
            in_path=None,
            out_path=None,
            neural=None,
            ensemble=None,
            detector_params=None,
            pipe=None,
            out_folder_name=None,
            net_threshold=0.5):
        '''
        Parameters:
        ------------------
        groupThres bool
            Decides if we should do grouping on neural detections
        method: string
            Can be either 'viola' or 'slide'
        '''
        assert method in ('viola', 'slide')

        self.method = method
        self.full_dataset = full_dataset
        self.data_fold = data_fold

        self.groupThres = dict()
        self.groupThres['metal'] = float(metal_groupThres)
        self.groupThres['thatch'] = float(thatch_groupThres)
        self.groupBounds = groupBounds
        self.erosion = erosion

        #self.single_detector = single_detector
        self.in_path = in_path
        if DEBUG:
            self.img_names = [
                img_name for img_name in os.listdir(self.in_path)
                if img_name.endswith('jpg')
            ][:1]
        else:
            self.img_names = [
                img_name for img_name in os.listdir(self.in_path)
                if img_name.endswith('jpg')
            ]
        self.out_path = out_path

        #Setup Viola: if we are given an evaluation directly, don't bother running viola
        if self.method == 'viola':
            self.pickle_viola = pickle_viola
            if self.pickle_viola is None:
                self.viola = ViolaDetector(pipeline=True,
                                           out_path=out_path,
                                           in_path=in_path,
                                           folder_name=out_folder_name,
                                           save_imgs=True,
                                           **detector_params)
            else:
                with open(pickle_viola, 'rb') as f:
                    self.viola_evaluation = pickle.load(f)
                    self.viola_evaluation.in_path = self.in_path
                    self.viola_evaluation.out_path = self.out_path

        #Setup the sliding window
        elif self.method == 'slide':
            self.slider = SlidingWindowNeural(full_dataset=self.full_dataset,
                                              in_path=self.in_path,
                                              out_path=self.out_path,
                                              **detector_params)
        else:
            raise ValueError('Need to specify either viola or sliding window')

        self.ensemble = ensemble

        #EVALUATION OBJECTS
        self.auc_thresholds = [.5]
        self.detections_after_neural = list()
        self.evaluation_after_neural = list()
        detector_names = detector_params[
            'detector_names'] if self.method == 'viola' else None
        for thres in self.auc_thresholds:
            detections = Detections()
            self.detections_after_neural.append(detections)
            self.evaluation_after_neural.append(
                Evaluation(detections=detections,
                           method='pipeline',
                           save_imgs=True,
                           out_path=self.out_path,
                           auc_threshold=thres,
                           folder_name=out_folder_name,
                           in_path=self.in_path,
                           detector_names=detector_names))

        self.auc = AucCurve(self.img_names,
                            self.evaluation_after_neural[0].correct_roofs,
                            self.out_path, self.method)
        if self.data_fold == utils.TESTING:
            self.classification = Classification(
                self.img_names, self.out_path,
                self.evaluation_after_neural[0].correct_roofs, self.method)
        print(self.img_names)

        self.neural_time = defaultdict(int)
        self.viola_time = defaultdict(int)

    def run(self, img_type='inhabited', img_names=None, in_path=None):
        '''
        1. Find proposals using ViolaJones or sliding window
        2. Resize the window and classify it
        3. Net returns a list of the roof coordinates of each type - saved in roof_coords
        '''

        img_names = img_names if img_names is not None else self.img_names
        in_path = in_path if in_path is not None else self.in_path
        for i, img_name in enumerate(img_names):
            print('***************** Image {0}: {1}/{2} *****************'.format(
                img_name, i, len(img_names) - 1))

            #VIOLA: currently it does no scoring, we commented out in viola_detector.py
            rect_detections = dict()
            if self.method == 'viola':
                if self.pickle_viola is None:
                    img = self.viola.detect_roofs(img_name=img_name,
                                                  in_path=in_path)
                    current_viola_detections = self.viola.viola_detections
                    self.viola_time[
                        img_type] = self.viola.evaluation.detections.total_time
                else:  #use the pickled detections for speed in testing the neural network
                    current_viola_detections = self.viola_evaluation.detections
                    self.viola_time[
                        img_type] = self.viola_evaluation.detections.total_time
                proposal_patches, proposal_coords, img_shape = self.find_viola_proposals(
                    current_viola_detections,
                    img_name=img_name,
                    in_path=in_path)
                for roof_type in utils.ROOF_TYPES:
                    if len(proposal_coords[roof_type]) > 0:
                        rect_detections[roof_type] = utils.polygons2boxes(
                            proposal_coords[roof_type])
                    else:
                        rect_detections[roof_type] = np.array([])

            #SLIDING WINDOW: also does no scoring
            elif self.method == 'slide':
                with Timer() as t:
                    #get the roofs with sliding detector
                    proposal_coords, rect_detections = self.slider.get_windows(
                        img_name, in_path=in_path)
                    #convert them to patches
                    proposal_patches, img_shape = self.find_slider_proposals(
                        rect_detections, img_name=img_name, in_path=in_path)
                print('Sliding window detection for one image took {} seconds'.format(
                    t.secs))
            else:
                print('Unknown detection method {}'.format(self.method))
                sys.exit(-1)

            if in_path == self.in_path:
                self.print_detections(rect_detections, img_name, '_viola')

            #NEURALNET
            print('Starting neural classification of image {}'.format(img_name))
            with Timer() as t:
                #NOTE: classified detections only has roofs with prob >= 0.5
                classified_detections, probs = self.neural_classification_AUC(
                    proposal_patches, rect_detections)
            print('Classification took {} secs'.format(t.secs))
            self.neural_time[img_type] += t.secs

            #GROUPING
            rect_detections, probs, grouping_time = self.nonmax_suppression(
                rect_detections, probs)
            self.neural_time[img_type] += grouping_time

            #PRINTING DETECTIONS
            if in_path == self.in_path:
                self.print_detections(
                    {
                        'metal': classified_detections['metal'][0],
                        'thatch': classified_detections['thatch'][0]
                    }, img_name, '_neural')
            det = dict()
            for roof_type in utils.ROOF_TYPES:
                det[roof_type] = rect_detections[roof_type][
                    probs[roof_type] > 0.5]
            if in_path == self.in_path:
                self.print_detections(det, img_name, '_grouped')

            #AUC AND CLASSIFICATION USING THE GROUPED DETECTIONS
            #only do AUC with the inhabited images
            if in_path == self.in_path:
                self.auc.set_detections(rect_detections, img_name)
                self.auc.set_probs(probs, img_name)
            #only do classification if we are using the testing set
            if self.data_fold == utils.TESTING:
                self.classification.set_detections(rect_detections, img_name)
                self.classification.set_probs(probs, img_name)

    def print_detections(self, detections, img_name, title):
        if detections is not None:
            for roof_type, detects in detections.items():
                img = cv2.imread(self.in_path + img_name)

                if img_name in self.evaluation_after_neural[0].correct_roofs[
                        roof_type]:
                    #the uninhabited images do not have an entry
                    utils.draw_detections(self.evaluation_after_neural[0].
                                          correct_roofs[roof_type][img_name],
                                          img,
                                          rects=True,
                                          color=(0, 255, 0),
                                          thickness=6)
                if detects.shape[0] > 0:
                    utils.draw_detections(detects,
                                          img,
                                          rects=True,
                                          color=(255, 0, 0),
                                          thickness=3)

                cv2.imwrite(
                    'debug/{}_{}_{}{}.jpg'.format(self.groupThres[roof_type],
                                                  img_name[:-4], roof_type,
                                                  title), img)

    def nonmax_suppression(self, rect_detections, probs):
        with Timer() as t:
            #set detections and score
            for roof_type in utils.ROOF_TYPES:
                #proper non max suppression from Felzenszwalb et al.
                if len(rect_detections[roof_type]) > 0:
                    rect_detections[roof_type], probs[
                        roof_type] = suppression.non_max_suppression(
                            rect_detections[roof_type],
                            probs[roof_type],
                            overlapThres=self.groupThres[roof_type])
        print('Grouping took {} seconds'.format(t.secs))
        return rect_detections, probs, t.secs

    def get_correct_class_per_detection(self, rect_detections, img_name):
        #this is needed to build the Recall precision curve

        #get the best class guess of the detections by scoring it with ground truth
        self.slider.detections.set_detections(
            roof_type='thatch',
            detection_list=rect_detections['thatch'],
            img_name=img_name)
        self.slider.detections.set_detections(
            roof_type='metal',
            detection_list=rect_detections['metal'],
            img_name=img_name)
        #score the image
        self.slider.evaluation.score_img(
            img_name=img_name, img_shape=(-1, -1), fast_scoring=True
        )  #since we use fast scoring, we don't need the img_shape

        #get the proper class by looking at the best score for each detection
        correct_classes = dict()
        for roof_type in utils.ROOF_TYPES:
            correct_classes[roof_type] = np.zeros(
                (len(rect_detections[roof_type])))
            for d, (detection, score) in enumerate(
                    self.slider.detections.best_score_per_detection[img_name]
                [roof_type]):
                correct_classes[roof_type][d] = 0 if score < 0.5 else 1
            correct_classes[roof_type] = list(correct_classes[roof_type])
        return correct_classes

    def group_min_bound(self, polygons, img_shape, erosion=0):
        '''
        Attempt at finding the minbound of all overlapping rects and merging them
        to a single detection. This unfortunately will merge nearby roofs.
        '''
        bitmap = np.zeros(img_shape, dtype='uint8')
        utils.draw_detections(np.array(polygons), bitmap, fill=True, color=1)

        if erosion > 0:
            kernel = np.ones((5, 5), np.uint8)
            bitmap = cv2.erode(bitmap, kernel, iterations=erosion)

        #get contours
        contours, hierarchy = cv2.findContours(bitmap, cv2.RETR_TREE,
                                               cv2.CHAIN_APPROX_SIMPLE)

        #get the min bounding rect for the rects
        min_area_conts = [
            np.int0(cv2.boxPoints(cv2.minAreaRect(cnt))) for cnt in contours
        ]  # cv2.cv.BoxPoints in the OpenCV 2 API
        return min_area_conts

    def find_viola_proposals(self,
                             viola_detections,
                             img_name=None,
                             in_path=None):
        '''Call viola to find coordinates of candidate roofs.
        Extract those patches from the image, transform them so they can be fed to the neural network.
        Return both the coordinates and the patches.
        '''
        in_path = self.in_path if in_path is None else in_path
        try:
            img_full = cv2.imread(in_path + img_name, flags=cv2.IMREAD_COLOR)
            img_shape = img_full.shape
        except IOError as e:
            print(e)
            sys.exit(-1)

        all_proposal_patches = dict()
        all_proposal_coords = dict()

        #extract patches for neural network classification
        for roof_type in ['metal', 'thatch']:
            all_proposal_coords[roof_type] = viola_detections.get_detections(
                img_name=img_name, roof_type=roof_type)
            #all_proposal_coords[roof_type] = self.viola.viola_detections.get_detections(img_name=img_name, roof_type=roof_type)
            patches = np.empty((len(all_proposal_coords[roof_type]), 3,
                                utils.PATCH_W, utils.PATCH_H))

            for i, detection in enumerate(all_proposal_coords[roof_type]):
                #extract the patch from the image using utils code
                img = utils.four_point_transform(img_full, detection)

                #transform the patch using utils code
                patch = utils.cv2_to_neural(img)
                patches[i, :, :, :] = patch

            all_proposal_patches[roof_type] = patches

        return all_proposal_patches, all_proposal_coords, img_shape

    def find_slider_proposals(self, slider_rects, img_name=None, in_path=None):
        #rects are in the form of (x, y, w, h)
        in_path = self.in_path if in_path is None else in_path
        try:
            img_full = cv2.imread(in_path + img_name, flags=cv2.IMREAD_COLOR)
            img_shape = img_full.shape
        except IOError as e:
            print(e)
            sys.exit(-1)

        all_proposal_patches = dict()

        #extract patches for neural network classification
        for roof_type in ['metal', 'thatch']:
            patches = np.empty((len(slider_rects[roof_type]), 3, utils.PATCH_W,
                                utils.PATCH_H))

            for i, rect in enumerate(slider_rects[roof_type]):
                #extract the patch from the image using utils code
                img = img_full[rect.ymin:rect.ymax, rect.xmin:rect.xmax, :]
                #transform the patch using utils code
                patch = utils.cv2_to_neural(img)
                patches[i, :, :, :] = patch

            all_proposal_patches[roof_type] = patches

        return all_proposal_patches, img_shape

    def process_viola(self, rows, cols, img_path=None, verbose=False):
        #Find candidate roof contours using Viola for all types of roof
        #returns list with as many lists of detections as the detectors we have passed
        self.viola.detect_roofs(img_name=self.img_name,
                                img_path=self.test_img_path + self.img_name)
        print('Detected {0} candidate roofs'.format(
            len(self.viola.roofs_detected[self.img_name])))
        if verbose:
            self.viola.mark_detections_on_img(img=self.image,
                                              img_name=self.img_name)

        #get the mask and the contours for the detections
        detection_mask, _ = self.viola.get_patch_mask(img_name=self.img_name,
                                                      rows=rows,
                                                      cols=cols)
        patch_location = self.out_path + self.img_name + '_mask.jpg'
        misc.imsave(patch_location, detection_mask)

        self.all_contours[self.img_name] = self.viola.get_detection_contours(
            patch_location, self.img_name)

    def neural_classification(self, proposal_patches, proposal_coords):
        classified_detections = defaultdict(list)
        for roof_type in utils.ROOF_TYPES:
            #classify with neural network

            if proposal_patches[roof_type].shape[0] > 1:
                if self.single_detector:  #we have a single net
                    classes = np.array(
                        self.net.test(proposal_patches[roof_type]))

                    #filter according to classification
                    for detection, classification in zip(
                            proposal_coords[roof_type], classes):
                        if classification == utils.NON_ROOF:
                            classified_detections['background'].append(
                                detection)
                        elif classification == utils.METAL:
                            classified_detections['metal'].append(detection)
                        elif classification == utils.THATCH:
                            classified_detections['thatch'].append(detection)

                else:  #we have one net per roof type
                    specific_net = self.net[roof_type]
                    classes = specific_net.test(proposal_patches[roof_type])
                    #filter according to classification
                    for detection, classification in zip(
                            proposal_coords[roof_type], classes):
                        if classification == 0:
                            classified_detections['background'].append(
                                detection)
                        elif classification == 1:
                            classified_detections[roof_type].append(detection)
                        else:
                            raise ValueError('Unknown classification of patch')
            else:
                print('No {0} detections'.format(roof_type))
        return classified_detections

    def neural_classification_AUC(self, proposal_patches, proposal_coords):
        #get the classification by evaluating it compared to the real roofs
        #get the probability of it being that type of roof
        classified_detections = dict()
        probs = dict()
        for roof_type in utils.ROOF_TYPES:
            classified_detections[roof_type] = list()
            if proposal_patches[roof_type].shape[0] > 1:
                probs[roof_type] = self.ensemble.predict_proba(
                    proposal_patches[roof_type], roof_type=roof_type)
                #different detections depending on threshold
                coords = np.array(proposal_coords[roof_type])
                for thres in self.auc_thresholds:
                    detections_logical = probs[roof_type] >= thres
                    classified_detections[roof_type].append(
                        coords[detections_logical])
            else:
                print('No {0} detections'.format(roof_type))
                for thres in self.auc_thresholds:
                    classified_detections[roof_type].append(np.array([]))
                probs[roof_type] = np.array([])
        return classified_detections, probs

    def save_img_detections(self,
                            img_name,
                            proposal_coords,
                            predictions,
                            in_path=None):
        raise ValueError('Incorrect method')
        in_path = self.in_path if in_path is None else in_path
        img = cv2.imread(self.in_path + img_name)
        roofs = DataLoader().get_roofs(in_path + img_name[:-3] + 'xml',
                                       img_name)
        for roof in roofs:
            cv2.rectangle(img, (roof.xmin, roof.ymin),
                          (roof.xmin + roof.width, roof.ymin + roof.height),
                          (0, 255, 0), 2)
        for (x, y, w, h), accept in zip(proposal_coords['metal'],
                                        predictions[img_name]['metal']):
            color = (0, 0, 0) if accept == 1 else (0, 0, 255)
            cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
        cv2.imwrite(self.out_path + img_name, img)
Example #17
    h = Hierarchy(["department", "municipality", "population_center"])

    df = df.rename(
        columns={
            "department_name": "name_department",
            "municipality_name": "name_municipality",
            "population_center_name": "name_population_center",
        })

    parent_code_table = repeated_table_to_parent_id_table(
        df,
        h,
        level_fields={
            "department": ["name_department"],
            "municipality": ["name_municipality"],
            "population_center": ["name_population_center"],
        },
    )
    parent_id_table = parent_code_table_to_parent_id_table(
        parent_code_table, h)

    # Reorder columns to keep diff clean
    parent_id_table = parent_id_table.loc[:, [
        "code", "name", "level", "parent_id"
    ]]

    c = Classification(parent_id_table, h)

    c.to_csv("out/locations_colombia_dane.csv")
    c.to_stata("out/locations_colombia_dane.dta")
Example #18
    def __init__(self, project, clustering_alg=None):
        Classification.__init__(self, project, clustering_alg)
Example #19
#-*- coding:utf-8 -*-
# AUTHOR:   yaolili
# FILE:     runClassification.py
# ROLE:     run classifier in Classification and get the result of prediction
# CREATED:  2015-12-15 09:28:02
# MODIFIED: 2015-12-15 09:28:03

import sys
import os
from classification import Classification

if __name__ == "__main__":
    if len(sys.argv) < 5:
        print "sys.argv[1]: classifier"
        print "sys.argv[2]: trainFile"
        print "sys.argv[3]: devFile"
        print "sys.argv[4]: outputFile"
        exit()
        
    cfInstance = Classification(sys.argv[1], sys.argv[2], sys.argv[3])
    cfInstance.getPreResult(sys.argv[4])
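An example invocation (the classifier name and file names are illustrative):

    python runClassification.py svm train.txt dev.txt predictions.txt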
Example #20
    if len(sys.argv) > 1:
        print('Parsing...', end=' ')
        sys.stdout.flush()
        p = Parse(sys.argv[1])
        p.compute_fqdn()
        print('DONE')

        print('Computing features (can take some time because of whois queries)...', end=' ')
        sys.stdout.flush()
        features = Features(p)
        features.compute()
        print('DONE')

        print('Classification...', end=' ')
        sys.stdout.flush()
        classification = Classification(features, p)
        classification.compute()
        print('DONE')

        print('Launching webserver...', end=' ')
        sys.stdout.flush()
        flask_app = Flask('caphaw-dns-classifier')
        print('DONE')

        @flask_app.route('/')
        def index():
            return render_template('index.html',
                X=features.X,
                X_scaled=features.X_scaled,
                features_list=[features.features_list]*len(features.X_scaled),
                all=sorted(classification.all),
Example #21
            scoreRR, PvalueRR = regression.fnRANSACRegressor(
                yearList, avgTempList, predictYear)
            scoreGP, PvalueGP = regression.fnGaussianProcessRegressor(
                yearList, avgTempList, predictYear)
            scoreSV, PvalueSV = regression.fnSVR(yearList, avgTempList,
                                                 predictYear)

            score = np.array(
                [scoreReg, scoreIso, scoreBR, scoreRR, scoreGP, scoreSV])
            pValue = np.array(
                [PvalueReg, PvalueIso, PvalueBR, PvalueRR, PvalueGP, PvalueSV])

            # compute the NaN mask once, before filtering, so the same mask
            # applies to both arrays (filtering pValue first would leave the
            # mask misaligned with score)
            valid = np.logical_not(np.isnan(pValue))
            pValue = pValue[valid]
            score = score[valid]

            maxScoreIndex = np.argmax(score)

        return dumps({"avgTemp": pValue[maxScoreIndex]})
    except Exception:
        return "error"


if __name__ == '__main__':
    global dataFrame
    global regression
    global classification
    dataFrame = DataParser()
    regression = Regression()
    classification = Classification()
    app.run()
Example #22
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import time
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split  # sklearn.cross_validation in old scikit-learn
from classification import Classification


if __name__ == "__main__":

    print('iris data')
    iris = datasets.load_iris()

    # split into training and test data at a 3:1 ratio
    train, test, train_label, test_label = train_test_split(iris.data, iris.target, test_size=0.25, random_state=0)
    clf = Classification(train, train_label)  # set the training data
    clf.set_test(test, test_label)  # set the test data
    clf.svm_gridsearch(5)  # grid search with 5-fold cross-validation
    clf.cv(5)  # show 5-fold cross-validation results
    print("test Result")
    clf.prediction()  # show classification results on the test data
Example #23
        for c in self.classes:
            X_c = np.array(
                [x for x, yy in zip(self.X, self.y) if yy == c])
            X_c_num = X_c.shape[0]
            X_c_mean = np.mean(X_c, axis=0)
            X_c_var = np.var(X_c, axis=0)
            self.parameter[c] = (X_c_num, X_c_mean, X_c_var)

    def predict(self, X):
        # log prior + log likelihood; adding the raw class count would skew
        # the posterior, while log(count) only shifts every class by the same
        # constant and leaves the argmax unchanged
        posterior = [
            np.log(self.parameter[c][0]) + self.pdfSum(X, c) for c in self.classes
        ]

        return self.classes[np.argmax(posterior, axis=0)]

    def pdfSum(self, X, c):
        mu = self.parameter[c][1]
        sigma = self.parameter[c][2]
        nu = np.exp(-(X - mu)**2 / (2 * sigma))
        de = np.sqrt(2 * np.pi * sigma) + 1e-5
        return np.sum(np.log(nu / de), axis=1)


if __name__ == '__main__':
    from classification import Classification

    clf = Classification()
    mm = NaiveBayes()
    mm.fit(clf.X_train, clf.y_train)
    print(mm.predict(clf.X_test))
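pdfSum returns the log-likelihood of each sample under a per-feature Gaussian model, sum over features j of log N(x_j; mu_j, sigma_j^2), with the 1e-5 term guarding against division by zero for zero-variance features. A quick smoke test on synthetic data (the blob parameters and shapes are assumptions):

import numpy as np

# Two well-separated Gaussian blobs; the model should recover the labels.
rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(5, 1, (50, 2))])
y = np.array([0] * 50 + [1] * 50)

mm = NaiveBayes()
mm.fit(X, y)
print((mm.predict(X) == y).mean())  # expect accuracy close to 1.0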
Example #24
                  how="left")

    # Merge in region codes
    alpha3_to_region = pd.read_csv("./in/countries_to_regions.csv",
                                   dtype={"parent_code": str})
    df = df.merge(alpha3_to_region, on="code_alpha3", how="left")

    # Add custom codes
    custom_codes = pd.read_csv("./in/custom-codes.csv",
                               dtype={"parent_code": str})
    df = pd.concat([df, custom_codes]).reset_index(drop=True)

    df["level"] = "country"

    # Add region code level
    region_codes = pd.read_table("./in/regions.tsv", dtype={"code": str})
    region_codes["code_alpha2"] = region_codes["code"]
    region_codes["code_alpha3"] = region_codes["code"]
    region_codes["code_numeric"] = region_codes["code"]
    region_codes = region_codes.drop("code", axis=1)
    df = pd.concat([df, region_codes]).reset_index(drop=True)

    h = Hierarchy(["region", "country"])
    df["name"] = df["name_en"]
    df["code"] = df["code_alpha3"]
    df = parent_code_table_to_parent_id_table(df, h)

    # Alpha3 classification
    df["code"] = df["code_alpha3"]
    Classification(df, h).to_csv("out/locations_international_iso_cid.csv")
Example #25
    hierarchy = pd.read_table("./in/FarmSize_Hierarchy.tsv", encoding="utf-8")
    hierarchy.columns = ["level1_code", "level0_code"]

    fields = {"level0": [], "level1": []}

    h = Hierarchy(["level0", "level1"])
    parent_code_table = repeated_table_to_parent_id_table(hierarchy, h, fields)
    parent_code_table.code = parent_code_table.code.astype(str)

    parent_code_table = parent_code_table.merge(names, on=["code", "level"])

    parent_id_table = parent_code_table_to_parent_id_table(
        parent_code_table, h)
    parent_id_table["name"] = parent_id_table.name_en

    parent_id_table = parent_id_table[[
        "code",
        "name",
        "level",
        "name_en",
        "name_es",
        "name_short_en",
        "name_short_es",
        "parent_id",
    ]]

    c = Classification(parent_id_table, h)

    c.to_csv("out/farm_size.csv")
    c.to_stata("out/farm_size.dta")
Example #26
                  "latitude", "longitude", "altitude",
                  "map_code", "ambito",
                  "population_total", "population_male", "population_female",
                  "dwellings_occupied"]

    df = df[["state_code", "state_name", "municipality_code",
             "municipality_name", "locality_code",
             "locality_name"]]


    df.state_code = df.state_code.astype(str).str.zfill(2)
    df.municipality_code = df.municipality_code.astype(str).str.zfill(3)
    df.locality_code = df.locality_code.astype(str).str.zfill(4)

    df.municipality_code = df.state_code + df.municipality_code
    df.locality_code = df.municipality_code + df.locality_code

    df.state_name = df.state_name.str.title()
    df.municipality_name = df.municipality_name.str.title()
    df.locality_name = df.locality_name.str.title()

    h = Hierarchy(["state", "municipality", "locality"])

    parent_code_table = repeated_table_to_parent_id_table(df, h)
    parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)

    c = Classification(parent_id_table, h)

    c.to_csv("out/locations_mexico_inegi.csv")
    c.to_stata("out/locations_mexico_inegi.dta")
Example #27
    def classify(self):
        print("\nclassify")

        self.process_classification_data()
        self.split_data(0.7)

        c = Classification(self.training_data, self.test_data)
        c.decision_tree_classifier()
        c.random_forest_classifier()
        c.naive_bayes()
        c.logistic_regression()
        c.gbtc()
        c.lsvc()
Example #28

if __name__ == "__main__":
    assert(len(sys.argv) == 3)

    file_name = sys.argv[1]
    new_file_prefix = sys.argv[2]

    df = pd.read_table(file_name, encoding="utf-16")
    df = parse_dane(df)
    df = df[~df.duplicated(["code"])]
    df = df.reset_index(drop=True)
    df.columns = ["name", "level", "code"]

    df.name = df.name.str.title()

    from classification import (parent_code_table_to_parent_id_table,
                                Classification, Hierarchy,
                                ordered_table_to_parent_code_table)

    h = Hierarchy(DANE_HIERARCHY)
    df = ordered_table_to_parent_code_table(df, h)
    df = parent_code_table_to_parent_id_table(df, h)
    c = Classification(df, h)

    # weird bug where pandas infer_type was returning mixed instead of string
    c.table.code = c.table.code.astype(str)

    c.to_csv(new_file_prefix + ".csv")
    c.to_stata(new_file_prefix + ".dta")
        if detections is not None:
            for roof_type, detects in detections.iteritems():
                img = cv2.imread(self.in_path+img_name)

                if img_name in self.evaluation_after_neural[0].correct_roofs[roof_type]:
                    #the uninhabited images do not have an entry
                    utils.draw_detections(self.evaluation_after_neural[0].correct_roofs[roof_type][img_name], img, rects=True, color=(0,255,0), thickness=6)
                if detects.shape[0] > 0: 
                    utils.draw_detections(detects, img, rects=True, color=(255,0,0), thickness=3)

                cv2.imwrite('debug/{}_{}_{}{}.jpg'.format(self.groupThres[roof_type], img_name[:-4],roof_type, title), img)


    def nonmax_suppression(self, rect_detections, probs):
        with Timer() as t:
            #set detections and score
            for roof_type in utils.ROOF_TYPES:
                #proper non max suppression from Felzenszwalb et al.
                if len(rect_detections[roof_type]) > 0:
                    rect_detections[roof_type], probs[roof_type] = suppression.non_max_suppression(rect_detections[roof_type], 
                                        probs[roof_type], overlapThres = self.groupThres[roof_type])
        print 'Grouping took {} seconds'.format(t.secs)
        return rect_detections, probs, t.secs



    def get_correct_class_per_detection(self, rect_detections, img_name):
        #this is needed to build the precision-recall curve

        #get the best class guess for each detection by scoring it against the ground truth
        self.slider.detections.set_detections(roof_type='thatch', detection_list=rect_detections['thatch'], img_name=img_name)
        self.slider.detections.set_detections(roof_type='metal', detection_list=rect_detections['metal'], img_name=img_name)
        #score the image
        self.slider.evaluation.score_img(img_name=img_name, img_shape=(-1,-1), fast_scoring=True) #since we use fast scoring, we don't need the img_shape
        
        #get the proper class by looking at the best score for each detection
        correct_classes = dict()
        for roof_type in utils.ROOF_TYPES:
            correct_classes[roof_type] = np.zeros((len(rect_detections[roof_type]))) 
            for d, (detection, score) in enumerate(self.slider.detections.best_score_per_detection[img_name][roof_type]):
                correct_classes[roof_type][d] = 0 if score<0.5 else 1
            correct_classes[roof_type] = list(correct_classes[roof_type])
        return correct_classes


    def group_min_bound(self, polygons, img_shape, erosion=0):
        '''
        Attempt at finding the min bound of all overlapping rects and merging them
        into a single detection. Unfortunately this also merges nearby but distinct roofs.
        '''
        bitmap = np.zeros(img_shape, dtype='uint8')
        utils.draw_detections(np.array(polygons), bitmap, fill=True, color=1)

        if erosion>0:
            kernel = np.ones((5,5),np.uint8)
            bitmap = cv2.erode(bitmap,kernel,iterations = erosion)

        #get contours
        contours, hierarchy = cv2.findContours(bitmap, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

        #get the min bounding rect for the rects
        min_area_conts = [np.int0(cv2.cv.BoxPoints(cv2.minAreaRect(cnt))) for cnt in contours]
        return min_area_conts


    def find_viola_proposals(self, viola_detections, img_name=None, in_path=None):
        '''Call viola to find coordinates of candidate roofs. 
        Extract those patches from the image, transform them so they can be fed to the neural network.
        Return both the coordinates and the patches.
        '''
        in_path = self.in_path if in_path is None else in_path
        img_full = cv2.imread(in_path+img_name, flags=cv2.IMREAD_COLOR)
        if img_full is None:
            #cv2.imread returns None rather than raising when the file cannot be read
            print 'Could not read image {0}'.format(in_path+img_name)
            sys.exit(-1)
        img_shape = img_full.shape

        all_proposal_patches = dict()
        all_proposal_coords = dict()
        
        #extract patches for neural network classification
        for roof_type in ['metal', 'thatch']: 
            all_proposal_coords[roof_type] = viola_detections.get_detections(img_name=img_name, roof_type=roof_type)
            #all_proposal_coords[roof_type] = self.viola.viola_detections.get_detections(img_name=img_name, roof_type=roof_type)
            patches = np.empty((len(all_proposal_coords[roof_type]), 3, utils.PATCH_W, utils.PATCH_H)) 

            for i, detection in enumerate(all_proposal_coords[roof_type]): 
                #extract the patch from the image using utils code
                img = utils.four_point_transform(img_full, detection)

                #transform the patch using utils code
                patch = utils.cv2_to_neural(img)
                patches[i, :, :,:] = patch 

            all_proposal_patches[roof_type] = patches  

        return all_proposal_patches, all_proposal_coords, img_shape


    def find_slider_proposals(self, slider_rects, img_name=None, in_path=None):
        #rects are in the form of (x, y, w, h)
        in_path = self.in_path if in_path is None else in_path
        img_full = cv2.imread(in_path+img_name, flags=cv2.IMREAD_COLOR)
        if img_full is None:
            #cv2.imread returns None rather than raising when the file cannot be read
            print 'Could not read image {0}'.format(in_path+img_name)
            sys.exit(-1)
        img_shape = img_full.shape

        all_proposal_patches = dict()
        
        #extract patches for neural network classification
        for roof_type in ['metal', 'thatch']: 
            patches = np.empty((len(slider_rects[roof_type]), 3, utils.PATCH_W, utils.PATCH_H)) 

            for i, rect in enumerate(slider_rects[roof_type]): 
                #extract the patch from the image using utils code
                img = img_full[rect.ymin:rect.ymax, rect.xmin:rect.xmax, :]
                #transform the patch using utils code
                patch = utils.cv2_to_neural(img)
                patches[i, :, :,:] = patch 

            all_proposal_patches[roof_type] = patches  

        return all_proposal_patches, img_shape



    def process_viola(self, rows, cols, img_path=None, verbose=False):
        #Find candidate roof contours using Viola for all types of roof
        #returns list with as many lists of detections as the detectors we have passed
        self.viola.detect_roofs(img_name=self.img_name, img_path=self.test_img_path+self.img_name)
        print 'Detected {0} candidate roofs'.format(len(self.viola.roofs_detected[self.img_name]))
        if verbose:
            self.viola.mark_detections_on_img(img=self.image, img_name=self.img_name)

        #get the mask and the contours for the detections
        detection_mask, _ = self.viola.get_patch_mask(img_name=self.img_name, rows=rows, cols=cols)
        patch_location = self.out_path+self.img_name+'_mask.jpg'
        misc.imsave(patch_location, detection_mask)

        self.all_contours[self.img_name] = self.viola.get_detection_contours(patch_location, self.img_name)
 

    def neural_classification(self, proposal_patches, proposal_coords):
        classified_detections = defaultdict(list)
        for roof_type in utils.ROOF_TYPES:
            #classify with neural network
            
            if proposal_patches[roof_type].shape[0] > 0: #classify any non-empty batch
                if self.single_detector: #we have a single net
                    classes = np.array(self.net.test(proposal_patches[roof_type]))

                    #filter according to classification         
                    for detection, classification in zip(proposal_coords[roof_type], classes):
                        if classification == utils.NON_ROOF:
                            classified_detections['background'].append(detection)
                        elif classification == utils.METAL:
                            classified_detections['metal'].append(detection)
                        elif classification == utils.THATCH:
                            classified_detections['thatch'].append(detection)

                else: #we have one net per roof type
                    specific_net = self.net[roof_type]
                    classes = specific_net.test(proposal_patches[roof_type])
                     #filter according to classification         
                    for detection, classification in zip(proposal_coords[roof_type], classes):
                        if classification == 0:
                            classified_detections['background'].append(detection)
                        elif classification == 1:
                            classified_detections[roof_type].append(detection)
                        else:
                            raise ValueError('Unknown classification of patch')
            else:
                print 'No {0} detections'.format(roof_type)
        return classified_detections



    def neural_classification_AUC(self, proposal_patches, proposal_coords):
        #get the classification by evaluating it compared to the real roofs
        #get the probability of it being that type of roof
        classified_detections = dict()
        probs = dict()
        for roof_type in utils.ROOF_TYPES:
            classified_detections[roof_type] = list()
            if proposal_patches[roof_type].shape[0] > 0: #classify any non-empty batch
                probs[roof_type] = self.ensemble.predict_proba(proposal_patches[roof_type], roof_type=roof_type)
                #different detections depending on threshold
                coords = np.array(proposal_coords[roof_type])
                for thres in self.auc_thresholds:
                    detections_logical = probs[roof_type]>=thres   
                    classified_detections[roof_type].append(coords[detections_logical]) 
            else:
                print 'No {0} detections'.format(roof_type)
                for thres in self.auc_thresholds:
                    classified_detections[roof_type].append(np.array([]))
                probs[roof_type] = np.array([])
        return classified_detections, probs 
 

    def save_img_detections(self, img_name, proposal_coords, predictions, in_path=None):
        raise ValueError('Incorrect method') #guard: everything below is currently unreachable
        in_path = self.in_path if in_path is None else in_path
        img = cv2.imread(self.in_path+img_name)
        roofs = DataLoader().get_roofs(in_path+img_name[:-3]+'xml', img_name)
        for roof in roofs:
            cv2.rectangle(img, (roof.xmin, roof.ymin), (roof.xmin+roof.width, roof.ymin+roof.height), (0,255,0), 2)
        for (x,y,w,h), accept in zip(proposal_coords['metal'], predictions[img_name]['metal']):
            color = (0,0,0) if accept==1 else (0,0,255) 
            cv2.rectangle(img, (x,y), (x+w, y+h), color, 2) 
        cv2.imwrite(self.out_path+img_name, img)
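A note on the grouping step above: nonmax_suppression delegates to suppression.non_max_suppression. As a point of reference only, a minimal sketch of the Felzenszwalb-style procedure it names might look like the following, assuming the boxes arrive as an (N, 4) numpy array of [x1, y1, x2, y2] rows with one score per box (the real suppression module may differ):

import numpy as np

def non_max_suppression(boxes, scores, overlapThres=0.3):
    #keep the best box, drop everything that overlaps it too much, repeat
    if len(boxes) == 0:
        return boxes, scores
    boxes = boxes.astype(float)
    x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = np.argsort(scores)  #ascending, so the best box sits at the end
    keep = []
    while len(order) > 0:
        i = order[-1]
        keep.append(i)
        #clipped intersection of box i with every remaining box
        w = np.maximum(0, np.minimum(x2[i], x2[order[:-1]]) - np.maximum(x1[i], x1[order[:-1]]) + 1)
        h = np.maximum(0, np.minimum(y2[i], y2[order[:-1]]) - np.maximum(y1[i], y1[order[:-1]]) + 1)
        overlap = (w * h) / areas[order[:-1]]
        order = order[:-1][overlap <= overlapThres]
    return boxes[keep], scores[keep]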
Exemplo n.º 30
0
    df = (pd.read_csv(
        "./in/Mexico Country codes - continents - Countries.csv",
        encoding="utf-8",
        dtype={
            "continent_code": str
        },
    ).rename(columns={
        "continent_code": "parent_code"
    }).drop("total_export", axis=1))
    df["level"] = "country"

    regions = pd.read_table(
        "./in/Mexico Country codes - continents - Continents - Regions.tsv",
        encoding="utf-8",
    ).rename(columns={"name": "name_en"})
    regions["name_short_en"] = regions["name_en"]
    regions["name_short_es"] = regions["name_es"]
    regions["level"] = "region"
    regions["code"] = regions["code"].astype(unicode)

    df = pd.concat([df, regions]).reset_index(drop=True)

    h = Hierarchy(["region", "country"])
    parent_id_table = parent_code_table_to_parent_id_table(df, h)
    parent_id_table["name"] = parent_id_table["name_en"]

    c = Classification(parent_id_table, h)
    c.to_csv("out/locations_international_mexico.csv")
    c.to_stata("out/locations_international_mexico.dta")
    def __init__(self, method=None,
                    data_fold=None,
                    full_dataset=True,
                    metal_groupThres=0.1, thatch_groupThres=0.1, 
                    groupBounds=False, erosion=0, suppress=None, 
                    pickle_viola=None,# single_detector=True, 
                    in_path=None, out_path=None, neural=None, 
                    ensemble=None, 
                    detector_params=None, pipe=None, out_folder_name=None, net_threshold=0.5):
        '''
        Parameters:
        ------------------
        metal_groupThres, thatch_groupThres: float
            Overlap thresholds used when grouping the neural detections
        method: string
            Can be either 'viola' or 'slide'
        '''
        assert method=='viola' or method=='slide'

        self.method = method
        self.full_dataset = full_dataset
        self.data_fold = data_fold

        self.groupThres = dict()
        self.groupThres['thatch'] = float(thatch_groupThres)
        self.groupThres['metal'] = float(metal_groupThres)
        self.groupBounds = groupBounds
        self.erosion = erosion

        #self.single_detector = single_detector
        self.in_path = in_path
        if DEBUG:
            self.img_names = [img_name for img_name in os.listdir(self.in_path) if img_name.endswith('jpg')][:1]
        else:
            self.img_names = [img_name for img_name in os.listdir(self.in_path) if img_name.endswith('jpg')]
        self.out_path = out_path
        
        #Setup Viola: if we are given an evaluation directly, don't bother running viola 
        if self.method == 'viola':
            self.pickle_viola = pickle_viola
            if self.pickle_viola is None:
                self.viola = ViolaDetector(pipeline=True, out_path=out_path, 
                                            in_path=in_path,
                                            folder_name=out_folder_name,
                                            save_imgs=True, **detector_params) 
            else:
                with open(pickle_viola, 'rb') as f:
                    self.viola_evaluation = pickle.load(f) 
                    self.viola_evaluation.in_path = self.in_path
                    self.viola_evaluation.out_path = self.out_path


        #Setup the sliding window
        elif self.method == 'slide':
            self.slider = SlidingWindowNeural(full_dataset=self.full_dataset, in_path=self.in_path, out_path=self.out_path, **detector_params) 
        else:
            raise ValueError('Need to specify either viola or sliding window')

        self.ensemble = ensemble

        #EVALUATION OBJECTS
        self.auc_thresholds = [.5]
        self.detections_after_neural = list()
        self.evaluation_after_neural = list()
        detector_names = detector_params['detector_names'] if self.method=='viola' else None
        for thres in self.auc_thresholds:
            detections = Detections()
            self.detections_after_neural.append(detections)
            self.evaluation_after_neural.append(Evaluation(detections=detections, 
                                        method='pipeline', save_imgs=True, out_path=self.out_path,
                                        auc_threshold=thres, folder_name=out_folder_name, in_path=self.in_path, detector_names=detector_names))

        self.auc = AucCurve(self.img_names, self.evaluation_after_neural[0].correct_roofs, self.out_path, self.method)
        if self.data_fold == utils.TESTING:
            self.classification = Classification(self.img_names, self.out_path, self.evaluation_after_neural[0].correct_roofs, self.method)
        print self.img_names

        self.neural_time = defaultdict(int)
        self.viola_time = defaultdict(int)
Exemplo n.º 32
0
    def __init__(self, O_O_distance=2.8, O_H_distance=1.0, intermediate_saves=[], folder="wales_21+", group_saves=[]):
        WaterAlgorithm.__init__(self)
        self.initialize(O_H_distance=O_H_distance, O_O_distance=O_O_distance, intermediate_saves=intermediate_saves, group_saves=group_saves, folder=folder, charge=1, do_symmetry_check=False, order=[3,14,9,2,10,19,16,15,7,18,4,0,17,1,12,6,5,8,20,11,13])
        self.N = 21
        self.classification = Classification(self)
Exemplo n.º 33
0
if __name__ == "__main__":
    names = pd.read_table(
        "./in/AgProducts_Expanded_Names.tsv", encoding="utf-8", dtype={"code": str}
    )

    hierarchy = pd.read_table(
        "./in/AgProducts_Expanded_Hierarchy.tsv", encoding="utf-8"
    )
    hierarchy.columns = ["level3_code", "level2_code", "level1_code", "level0_code"]

    fields = {"level0": [], "level1": [], "level2": [], "level3": []}

    h = Hierarchy(["level0", "level1", "level2", "level3"])
    parent_code_table = repeated_table_to_parent_id_table(hierarchy, h, fields)
    parent_code_table.code = parent_code_table.code.astype(str)

    parent_code_table = parent_code_table.merge(names, on=["code", "level"])

    parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)
    parent_id_table["name"] = parent_id_table.name_en

    parent_id_table = parent_id_table[
        ["code", "name", "level", "name_en", "name_es", "parent_id"]
    ]

    c = Classification(parent_id_table, h)
    c.table.code = c.table.code.str.lower()

    c.to_csv("out/agricultural_products_expanded.csv")
    c.to_stata("out/agricultural_products_expanded.dta")
Exemplo n.º 34
0
    def test_sigmoid_vector(self):
        small_sig = Classification._sigmoid(np.array([-10000, -10000, -10000]))
        large_sig = Classification._sigmoid(np.array([10000, 10000, 10000]))
        for s_val, l_val in zip(small_sig, large_sig):
            self.assertAlmostEqual(s_val, 0)
            self.assertAlmostEqual(l_val, 1)
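The test above only pins down the asymptotes. A numerically stable sigmoid that would satisfy it, written as a sketch rather than the project's actual implementation, is short:

import numpy as np

def _sigmoid(z):
    #clipping keeps np.exp from overflowing for very large |z|
    z = np.clip(z, -500, 500)
    return 1.0 / (1.0 + np.exp(-z))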
Exemplo n.º 35
0
import argparse

from flask import Flask, request, jsonify
from classification import Classification

FLAGS = None
BOT = None

# Start web server
application = Flask(__name__)


@application.route('/chat', methods=['POST'])
def chat():
    text = request.get_data(as_text=True)
    result = BOT.handle(text)
    return jsonify(result)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--port',
                        type=int,
                        default=8080,
                        help='Port for http server to listen on.')
    FLAGS, unparsed = parser.parse_known_args()

    # Creates NLP chat bot.
    BOT = Classification()

    application.run(host='0.0.0.0', port=FLAGS.port)
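To exercise the endpoint once the server is up, a small client sketch (the port matches the default flag; that the bot accepts plain text in the request body is an assumption here):

import requests

#assumes the server above is running locally with the default --port
resp = requests.post('http://localhost:8080/chat', data='hello there')
print(resp.json())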
Exemplo n.º 36
0
    def test_grad(self):
        # Unsure of the exact gradient values, but it should be a vector with
        # one entry per feature (+1 for the intercept)
        theta_z = pd.Series(np.zeros(len(self.X.columns)))
        clsfy = Classification(self.X, self.y)
        grad = clsfy._gradient(theta_z)
        print(grad)
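The expected shape can be derived rather than guessed: for logistic regression the gradient of the cost is (1/m) * X^T (sigmoid(X theta) - y), which has exactly one entry per column of X. A sketch, assuming X is an (m, n) design matrix and y an (m,) label vector:

import numpy as np

def _gradient(theta, X, y):
    #gradient of the logistic cost: (1/m) * X^T (sigmoid(X theta) - y)
    m = len(y)
    h = 1.0 / (1.0 + np.exp(-X.dot(theta)))
    return X.T.dot(h - y) / m  #shape (n,): one entry per feature column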
Exemplo n.º 37
0
    df.name_english = df.name_english.str.replace(", $", "")

    h = Hierarchy(
        ["twodigit", "threedigit", "fourdigit", "fivedigit", "sixdigit"])

    df.loc[df.code.str.len() == 2, "level"] = "twodigit"
    df.loc[df.code.str.len() == 3, "level"] = "threedigit"
    df.loc[df.code.str.len() == 4, "level"] = "fourdigit"
    df.loc[df.code.str.len() == 5, "level"] = "fivedigit"
    df.loc[df.code.str.len() == 6, "level"] = "sixdigit"

    spanish = df[["code", "level", "name_spanish"]]
    spanish.columns = ["code", "level", "name_es"]

    # make sure this is the hand-fixed version
    assert df.loc[304, "code"] == '31'

    df = df[["code", "name_english", "level"]]
    df.columns = ["code", "name", "level"]

    parent_code_table = ordered_table_to_parent_code_table(df, h)
    parent_id_table = parent_code_table_to_parent_id_table(
        parent_code_table, h)

    parent_id_table = parent_id_table.merge(spanish, on=["level", "code"])

    c = Classification(parent_id_table, h)

    c.to_csv("out/industries_mexico_scian_2007.csv")
    c.to_stata("out/industries_mexico_scian_2007.dta")
    ids = os.listdir(datapath)
    for id in ids:
        idpath = os.path.join(datapath, id)
        idfaces = os.listdir(idpath)
        for id_train_face in idfaces:
            if id_train_face.split('.')[-1] != 'jpg':
                continue
            img = cv2.imread(os.path.join(idpath, id_train_face))
            img = cv2.resize(img, input_shape)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            X.append(img)
            y.append(ids.index(id))

    X = np.asarray(X).astype('float32')
    y = np.asarray(y)
    X /= 255
    return X, y


input_shape = (180, 180)
num_classes = 7
x_train, y_train = load_faces_data('/data_out/siamese_faces/train')
x_test, y_test = load_faces_data('/data_out/siamese_faces/test')

siam = Siamese(x_train, y_train, x_test, y_test, input_shape, num_classes)
siam.train(epochs=5)

classifier = Classification(x_train, y_train, x_test, y_test, input_shape,
                            num_classes)
classifier.train(epochs=5)
Exemplo n.º 39
0
class GridSearch(object):

    def __init__(self, dataset_lists, feature="edge_hist",
                 learning_model="dnn", score_type="all"):
        self.feature = feature
        self.learning_model = learning_model
        self.score_type = score_type
        self.best_accuracy = 0
        self.best_recall = 0
        self.best_params = {}
        self.best_confusion_matrix = 0
        self.clf = Classification(dataset_lists, feature, learning_model,
                                  output_every_scores=False)

    def run(self):
        print(f"{'='*30} grid search {'='*30}")
        print(f"{'='*10} feature: {self.feature}, "
              f"model: {self.learning_model}, "
              f"score_type: {self.score_type} {'='*10}")

        if self.learning_model == "dnn":
            self._dnn_grid_search()
        elif self.learning_model == "svm":
            self._svm_grid_search()
        self._output_best_scores()

    def _dnn_grid_search(self):
        for alpha in ALPHA:
            self.params = {
                "hidden_layer_sizes": HIDDEN_LAYER_SIZES,
                "alpha": alpha,
                "max_iter": MAX_ITER,
                "random_state": RANDOM_STATE,
            }
            self.grid_search()

    def _svm_grid_search(self):
        for c in PARAMS:
            for gamma in PARAMS:
                self.params = {
                    "C": c,
                    "gamma": gamma,
                    "decision_function_shape": DECISION_FUNCTION_SHAPE,
                    "random_state": RANDOM_STATE,
                }
                self.grid_search()

    def grid_search(self):
        self.accuracy, self.recall, self.confusion_matrix = \
                self.clf.train_and_test(**self.params)
        if self.score_type == "all":
            self._update_best_scores()
        elif self.score_type == "fake":
            self._update_best_fake_scores()

    # Prioritize overall accuracy
    def _update_best_scores(self):
        if self.accuracy > self.best_accuracy:
            self._update_scores()

    # Prioritize recall
    def _update_best_fake_scores(self):
        if self.recall > self.best_recall:
            self._update_scores()

    def _update_scores(self):
        self.best_accuracy = self.accuracy
        self.best_recall = self.recall
        self.best_params = self.params
        self.best_confusion_matrix = self.confusion_matrix

    def _output_best_scores(self):
        print(f"best params: {self.best_params}")
        print(f"best accuracy: {self.best_accuracy * 100} %")
        print(f"best recall: {self.best_recall * 100} %")
        print(self.best_confusion_matrix)
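For context, a hypothetical driver for this class (dataset_lists and the grid constants such as ALPHA, PARAMS, and HIDDEN_LAYER_SIZES are assumed to be defined at module level in the original project):

#hypothetical usage; dataset_lists comes from the surrounding project
for model in ("dnn", "svm"):
    GridSearch(dataset_lists, feature="edge_hist",
               learning_model=model, score_type="all").run()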
Exemplo n.º 40
0
from classification import (Hierarchy, ordered_table_to_parent_code_table,
                            parent_code_table_to_parent_id_table,
                            Classification)

if __name__ == "__main__":

    sinco = pd.read_csv("in/SINCO_2011.csv", header=None, encoding="latin-1")

    sinco.columns = ["data"]
    sinco = sinco[~sinco.data.str.startswith("INEGI.")]
    sinco = sinco[~sinco.data.str.startswith(u"Clave Descripción")]

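    # Lines that do not start with a numeric code are continuations of the
    # previous entry; walking in reverse folds each one back into the row above.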
    for index, row in reversed(list(sinco[~sinco.data.str.match("^\d* ")].iterrows())):
        sinco.ix[index - 1] += (" " + sinco.ix[index])

    sinco = sinco[sinco.data.str.match("^\d* ")]

    sinco = sinco.data.str.split(" ", 1).apply(pd.Series, 1)
    sinco.columns = ["code", "name"]

    sinco["level"] = sinco["code"].apply(lambda x: str(len(x)) + "digit")
    h = Hierarchy(["1digit", "2digit", "3digit", "4digit"])

    parent_code_table = ordered_table_to_parent_code_table(sinco, h)
    parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)

    c = Classification(parent_id_table, h)

    c.to_csv("out/occupations_sinco_2011.csv")
    c.to_stata("out/occupations_sinco_2011.dta")
Exemplo n.º 41
0
    df.parent_id = df.parent_id.astype(float)

    h = Hierarchy(["country", "department", "msa", "municipality"])
    df.level = df.level.astype("category", categories=h, ordered=True)
    df.level = df.level.astype(str)

    # Drop old Callao department and province
    # Do this after reset_index to not mess up the id order
    df = df[df.code != "070000"]
    df = df[df.code != "070100"]

    # Order the columns
    df = df[
        [
            "code",
            "name",
            "level",
            "name_es",
            "name_en",
            "name_short_es",
            "name_short_en",
            "parent_id",
        ]
    ]

    c = Classification(df, h)

    c.to_csv("out/locations_peru_datlas.csv")
    c.to_stata("out/locations_peru_datlas.dta")
Exemplo n.º 42
0
    df = pd.read_table("in/DIVIPOLA_20150331.txt", encoding="utf-16")
    df.columns = ["department_code", "municipality_code",
                  "population_center_code", "department_name",
                  "municipality_name", "population_center_name",
                  "population_center_type", "longitude", "", "latitude",
                  "district", "municipality_type", "metro_area"]

    df = df[["department_code", "department_name", "municipality_code",
             "municipality_name", "population_center_code",
             "population_center_name"]]


    df.department_code = df.department_code.astype(str).str.zfill(2)
    df.municipality_code = df.municipality_code.astype(str).str.zfill(5)
    df.population_center_code = df.population_center_code.astype(str).str.zfill(8)

    df.department_name = df.department_name.str.title()
    df.municipality_name = df.municipality_name.str.title()
    df.population_center_name = df.population_center_name.str.title()

    h = Hierarchy(["department", "municipality", "population_center"])

    parent_code_table = repeated_table_to_parent_id_table(df, h)
    parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)

    c = Classification(parent_id_table, h)

    c.to_csv("out/locations_colombia_dane.csv")
    c.to_stata("out/locations_colombia_dane.dta")
Exemplo n.º 43
0
    hierarchy.columns = ["level2_code", "level1_code", "level0_code"]

    fields = {"level0": [], "level1": [], "level2": []}

    h = Hierarchy(["level0", "level1", "level2"])
    parent_code_table = repeated_table_to_parent_id_table(hierarchy, h, fields)
    parent_code_table.code = parent_code_table.code.astype(str)

    parent_code_table = parent_code_table.merge(names, on=["code", "level"])

    parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)
    parent_id_table["name"] = parent_id_table.name_en

    parent_id_table = parent_id_table[
        [
            "code",
            "name",
            "level",
            "name_en",
            "name_es",
            "name_short_en",
            "name_short_es",
            "parent_id",
        ]
    ]

    c = Classification(parent_id_table, h)

    c.to_csv("out/land_use.csv")
    # c.to_stata("out/land_use.dta")
Exemplo n.º 44
0
    # Replace trailing comma and space
    df.name_spanish = df.name_spanish.str.replace(", $", "")
    df.name_english = df.name_english.str.replace(", $", "")

    h = Hierarchy(["twodigit", "threedigit", "fourdigit", "fivedigit", "sixdigit"])

    df.loc[df.code.str.len() == 2, "level"] = "twodigit"
    df.loc[df.code.str.len() == 3, "level"] = "threedigit"
    df.loc[df.code.str.len() == 4, "level"] = "fourdigit"
    df.loc[df.code.str.len() == 5, "level"] = "fivedigit"
    df.loc[df.code.str.len() == 6, "level"] = "sixdigit"

    spanish = df[["code", "level", "name_spanish"]]
    spanish.columns = ["code", "level", "name_es"]

    # make sure this is the hand-fixed version
    assert df.loc[304, "code"] == "31"

    df = df[["code", "name_english", "level"]]
    df.columns = ["code", "name", "level"]

    parent_code_table = ordered_table_to_parent_code_table(df, h)
    parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)

    parent_id_table = parent_id_table.merge(spanish, on=["level", "code"])

    c = Classification(parent_id_table, h)

    c.to_csv("out/industries_mexico_scian_2007.csv")
    c.to_stata("out/industries_mexico_scian_2007.dta")
Exemplo n.º 45
0
'''
A few notes on predict.py:
1. It cannot run batch predictions directly. To predict in batches, iterate over a folder with os.listdir() and open each image with Image.open before predicting.
2. To save the predictions to a txt file, open one with open() and use its write method; txt_annotation.py can serve as a reference.
'''
from PIL import Image

from classification import Classification

classification = Classification()

while True:
    img = input('Input image filename:')
    try:
        image = Image.open(img)
    except Exception:
        print('Open Error! Try again!')
        continue
    else:
        class_name = classification.detect_image(image)
        print(class_name)
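Following the note at the top of this example, a batch-prediction sketch built from os.listdir() and Image.open (dir_path and results.txt are placeholder names, not part of the original project):

import os

from PIL import Image

from classification import Classification

classification = Classification()
dir_path = 'img/'  #placeholder input folder

with open('results.txt', 'w') as f:
    for file_name in os.listdir(dir_path):
        if not file_name.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue
        image = Image.open(os.path.join(dir_path, file_name))
        class_name = classification.detect_image(image)
        f.write('{};{}\n'.format(file_name, class_name))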
Exemplo n.º 46
0
import sys
from optparse import OptionParser

def main():
    parser = OptionParser()
    parser.add_option("--stemmer-language", dest="stemmer_language", help="Language for SnowballStemmer", default="english")
    parser.add_option('-i', action="store_true", dest="ignore_stopwords_stemmer", help="Ignore stopwords in stemmer, default false", default=False)
    parser.add_option("--stopwords-language", dest="stopwords_language", help="Language for stopwords")
    parser.add_option("-k", action="store_true", dest="keep_stopwords", help="Keep stopwords, default remove", default=False)
    parser.add_option('--load-classifier', dest="load_classifier_file_path", help="Specify load classifiers file")
    parser.add_option('--create-classifier', dest="create_classifier", help="File for training set")
    parser.add_option('--row-training-set', dest="row_training_set", help="Number of row for training set", default=1000)
    parser.add_option('-r', action="store_true", dest="random_row_training_set", help="Get random row from training set file", default=False)
    parser.add_option('--text-field', dest="text_field", help="text field in json file", default="text")
    parser.add_option('--word-tokenize-language', dest="word_tokenize_language", help="Word tokenize language", default="english")
    parser.add_option('--classification-field', dest="classification_field", help="Classification field in json data", default="category")
    parser.add_option('--dump-classifier', dest="dump_classifier", help="Dump classifier file", default=False)
    parser.add_option('-a', action="store_true", dest="calculate_accuracy", help="Calculate accuracy", default=False)
    parser.add_option('--test-file-path', dest="test_file_path", help="Test file path")
    parser.add_option('--row-test-set', dest="row_test_set", help="Number of row for test set", default=500)
    parser.add_option('--random-row-test-set', action="store_true", dest="random_row_test_set", help="Get random row from test set file", default=False)
    parser.add_option('--test-text-field', dest="test_text_field", help="text field in json test file", default="text")
    parser.add_option('--test-classification-field', dest="test_classification_field", help="classification field in json test file", default="category")
    parser.add_option('--classify', dest="classify_text", help="classify text", default=False)
    (options, args) = parser.parse_args(sys.argv)

    cl = Classification(
        stemmer_language=options.stemmer_language,
        stopwords_language=options.stopwords_language,
        ignore_stopwords_stemmer=options.ignore_stopwords_stemmer,
    )

    if options.load_classifier_file_path:
        cl.load_classifier(load_classifier_file_path=options.load_classifier_file_path)
    elif options.create_classifier:
        cl.create_and_train_classifier(
            training_file_path=options.create_classifier,
            keep_stopwords=options.keep_stopwords,
            row_training_set=options.row_training_set,
            random_row_training_set=options.random_row_training_set,
            text_field=options.text_field,
            word_tokenize_language=options.word_tokenize_language,
            classification_field=options.classification_field
        )

    if options.dump_classifier:
        cl.dump_classifier(options.dump_classifier)
    if options.calculate_accuracy:
        cl.accuracy(
            test_file_path=options.test_file_path,
            keep_stopwords=options.keep_stopwords,
            row_test_set=options.row_test_set,
            random_row_test_set=options.random_row_test_set,
            text_field=options.test_text_field,
            word_tokenize_language=options.word_tokenize_language,
            classification_field=options.test_classification_field
        )

    if options.classify_text:
        cl.classify(
            text=options.classify_text,
            keep_stopwords=options.keep_stopwords,
            word_tokenize_language=options.word_tokenize_language
        )
Exemplo n.º 47
0
    trans = trans.apply(fill_code, axis=1)

    # Prospedia specific
    trans = trans[trans.level != "section"]
    df = pd.read_table("./in/prospedia_hs_structure.txt")
    df.columns = ["4digit_code", "2digit_code", "prospedia_section_code"]
    df["4digit_code"] = df["4digit_code"].astype(str).str.zfill(4)
    df["4digit_name"] = None
    df["2digit_code"] = df["2digit_code"].astype(str).str.zfill(2)
    df["2digit_name"] = None
    df["prospedia_section_name"] = None
    df["prospedia_section_code"] = df["prospedia_section_code"].astype(str).str.zfill(1)

    h = Hierarchy(["prospedia_section", "2digit", "4digit"])

    parent_code_table = repeated_table_to_parent_id_table(df, h)
    parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)

    parent_id_table = parent_id_table.merge(trans, on=["level", "code"])
    parent_id_table.name = parent_id_table.name_en

    assert parent_id_table.name.isnull().sum() == 3
    parent_id_table.loc[parent_id_table.name.isnull(), "name"] = u"No name"
    assert parent_id_table.name.isnull().sum() == 0

    c = Classification(parent_id_table, h)

    c.to_csv("out/products_mexico_prospedia.csv")
    c.to_stata("out/products_mexico_prospedia.dta")