예제 #1
0
def rake(filePath):
    """Extract index keywords from the file at *filePath*.

    Pipeline: read the raw text, preprocess it into candidate
    keywords, then postprocess the candidates into the final list.

    Params:
        filePath (str): path of the file to read.

    Returns:
        list: index keywords produced by the postprocessing step.
    """
    text = readFile.readFile(filePath)
    candidates = preprocessing.Preprocess().preprocess(text)
    return postprocessing.postprocess(candidates)
예제 #2
0
import preprocessing

from A1 import model_svm_a1 as svm_a1
from A2 import model_svm_a2 as svm_a2
from B1 import model_cnn_b1 as cnn_b1
from B2 import model_cnn_b2 as cnn_b2

# ======================================================================================================================
# Data preprocessing
pre = preprocessing.Preprocess()
# Preprocess celeba dataset, 68 landmark features extraction.
# NOTE(review): the boolean argument presumably switches between the
# train/validation data (False) and the additional test set (True) —
# confirm against Preprocess.preprocess_celeba's signature.
img_train_gender, label_train_gender, img_train_emo, label_train_emo, img_val_gender, label_val_gender, img_val_emo, label_val_emo = pre.preprocess_celeba(
    False)

# Preprocess cartoon dataset, image preprocess
train_imgs_cartoon, train_labels_cartoon, val_imgs_cartoon, val_labels_cartoon = pre.preprocess_cartoon(
    False)

# Additional test dataset (True selects the held-out test split)
img_test_gender, label_test_gender, img_test_emo, label_test_emo = pre.preprocess_celeba(
    True)
test_imgs_cartoon, test_labels_cartoon = pre.preprocess_cartoon(True)
# ======================================================================================================================
# Task A1: SVM model (presumably gender classification, judging by the
# clf_gender / label_*_gender names) — train, then evaluate on validation.
model_A1 = svm_a1.Utils_A1()
acc_A1_train, clf_gender = model_A1.train(img_train_gender, label_train_gender)
acc_A1_val = model_A1.test(clf_gender, img_val_gender, label_val_gender)

# Evaluate the trained classifier on the additional test dataset
acc_A1_test = model_A1.test(clf_gender, img_test_gender, label_test_gender)
#
예제 #3
0
    data.to_csv('variants_encoded_only_VUS.csv', index=False)
    return data


# Dump a pandas DataFrame to CSV with no encoding applied to the values.
def write_not_encoded(data):
    """Write *data* to 'variants_not_encoded.csv' without the index column."""
    out_path = 'variants_not_encoded.csv'
    data.to_csv(out_path, index=False)


# Data insertion: load the variants CSV, forcing ambiguous columns to a
# known dtype so pandas does not mis-infer them on read.
pp = preprocessing.Preprocess(
    data_path_c='dict2csv/variants.csv',
    read_dtype={
        'motif.ehipos': object,
        'motif.ename': object,
        'cadd.istv': object
    },
    autoEliminateNullColumns=False,
    autoImpute=False)  #https://stackoverflow.com/a/27232309/8149411
data = pp.getData()
# FIX: np.bool was a deprecated alias for the builtin bool and was removed
# in NumPy 1.24 — using it raises AttributeError on modern NumPy. The
# builtin bool produces the same astype result.
data = data.astype({
    'motif.ehipos': bool,
    'motif.ename': bool,
    'cadd.istv': str
})

# Drop id, url and license columns (metadata with no predictive value)
pp.dropCols(
    ['_id', 'cadd._license', 'clinvar._license', 'clinvar.rsid', '_score'])
data = pp.getData()
예제 #4
0
파일: main.py 프로젝트: yoonjong12/NDSL-API
import os
import preprocessing
# Konlpy tokenizer
from konlpy.tag import Okt, Komoran, Kkma

# Konlpy morphological analyzers.
# NOTE(review): NDSL and `etc` are not imported in this chunk — presumably
# they come from elsewhere in the project; verify before running standalone.
okt = Okt()
komoran = Komoran()
kkma = Kkma()

# ex) 'keyValue=12341234' — replace with the key number you were issued
ndsl = NDSL('keyValue=12341234')
# ex) input: 'drone AR drone autonomous-driving' -> query = ['drone','AR','drone','autonomous-driving']
query = input().split()
load = 'patent_'

# Fetch patents
ndsl.getPatent(query, load)

# Organize the fetched patents
etc.sortPatent([query[0]], load, 'temp')

# Organize only the patent abstracts
try:
    os.mkdir('abstract')
except:
    # Directory likely already exists; any other mkdir error is also ignored.
    pass
etc.getAb(query[0], load, 'abstract/patent_')

# Tokenizing the abstracts with the Okt analyzer
pre = preprocessing.Preprocess(okt)
pre.getToken([query[0]], 'abstract/patent_', 'abstract/patent_')
# -*- coding: utf-8 -*-
import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Load the dataset, drop the 'ulke' (country) column and the first five
# rows, then scale the first three feature columns.
pp = preprocessing.Preprocess('veriler.csv')
pp.dropCols(['ulke'])
pp.dropRows([0, 1, 2, 3, 4])
pp.scale([0, 1, 2])

# Split into train/test with 'cinsiyet' (gender) as the target column.
x_train, x_test, y_train, y_test = pp.trainTestSplitting(['cinsiyet'])

# Fit a logistic-regression classifier on the flattened target vector.
classifier = LogisticRegression(solver='lbfgs', random_state=0)
classifier.fit(x_train, y_train.values.ravel())

# Show predictions next to the ground truth, then the confusion matrix.
predicted = classifier.predict(x_test)
print(predicted)
print(y_test.values.ravel())

print(confusion_matrix(y_test, predicted))
예제 #6
0
# -*- coding: utf-8 -*-
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB
from sklearn.metrics import confusion_matrix
import preprocessing

# Load the dataset and drop the 'ulke' (country) column.
pp = preprocessing.Preprocess('src/veriler.csv')
pp.dropCols(['ulke'])
#pp.encode()
data = pp.getData()

# 'cinsiyet' (gender) is the classification target.
x_train, x_test, y_train, y_test = pp.trainTestSplitting(['cinsiyet'])
y_train = y_train.values.ravel()

# GaussianNB: suited to continuously distributed features.
gnb = GaussianNB()
gnb.fit(x_train, y_train)
y_pred_gnb = gnb.predict(x_test)
cm_gnb = confusion_matrix(y_test, y_pred_gnb)
print('GaussianNB:\n', cm_gnb)

# MultinomialNB: suited to nominal / discretely distributed features.
mnb = MultinomialNB()
mnb.fit(x_train, y_train)
y_pred_mnb = mnb.predict(x_test)
cm_mnb = confusion_matrix(y_test, y_pred_mnb)
# FIX: label was misspelled 'MultiominalNB' in the printed output.
print('MultinomialNB:\n', cm_mnb)

# ComplementNB: for nominal targets with class imbalance; especially in
# text classification Complement typically outperforms Multinomial.
compnb = ComplementNB()
compnb.fit(x_train, y_train)
y_pred_compnb = compnb.predict(x_test)
cm_compnb = confusion_matrix(y_test, y_pred_compnb)
print('ComplementNB:\n', cm_compnb)