Example #1
print 'train lex raw ', len(train_lex_raw)
print 'train y raw', len(train_y_raw)
print train_lex_raw[1]
print train_y_raw[1]

test_toks_raw = []
test_lex_raw = []
test_y_raw = []
with open('./dataset/DDI13/DDI13_test_processed_drugbank.csv', 'rU') as f:
    rd = csv.DictReader(f)
    for row in rd:
        test_toks_raw.append(parselist(row['lower_tokens']))
        test_lex_raw.append([row['lower_text']])
        test_y_raw.append(parselist(row['label']))
print 'test lex raw ', len(test_lex_raw)
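
# Note: the `parselist` helper used above is not defined in this snippet; a
# minimal sketch, assuming the CSV columns store stringified Python lists
# (e.g. "['drug', 'interaction', ...]"):
import ast

def parselist(s):
    # Safely turn the stored string back into a real Python list
    return ast.literal_eval(s)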

# Convert each sentence of normalized tokens and labels into arrays of indices
train_lex = vectorize(train_toks_raw, tok2idx)
train_y = vectorize(train_y_raw, labels2idx)
valid_lex = vectorize(valid_toks_raw, tok2idx)
valid_y = vectorize(valid_y_raw, labels2idx)
test_lex = vectorize(test_toks_raw, tok2idx)
test_y = vectorize(test_y_raw, labels2idx)
print 'test lex ', len(test_lex)
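
# Note: the `vectorize` used above is assumed to map each sentence's tokens
# (or labels) to integer indices via the given dictionary; a minimal sketch
# (the out-of-vocabulary fallback to index 0 is an assumption):
import numpy as np

def vectorize(sentences, mapping):
    return [np.asarray([mapping.get(t, 0) for t in sent], dtype='int32')
            for sent in sentences]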

# # Pickle the resulting data set
# with open('./dataset/DDI13_processed_10fold_testOnDrugbank.pkl','w') as fout:
#     pkl.dump([[train_toks_raw,train_lex,train_y],[valid_toks_raw,valid_lex,valid_y],[test_toks_raw,test_lex,test_y],
#               {'labels2idx':labels2idx, 'words2idx':tok2idx}], fout)
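
# The pickled splits could later be read back like this (a sketch mirroring
# the dump above; `pkl` is assumed to be cPickle):
# import cPickle as pkl
# with open('./dataset/DDI13_processed_10fold_testOnDrugbank.pkl') as fin:
#     train_set, valid_set, test_set, dicts = pkl.load(fin)
#     labels2idx, tok2idx = dicts['labels2idx'], dicts['words2idx']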
Example #2
instances, classes, tags_cnt = build_data_set(user_id, FEATURES_LEVEL == 0)

print "Set built, %d(%d+%d) (ratio: %.2f)" % (
    len(classes), classes.count(
        Classes.INTERESTED), classes.count(Classes.UNKNOWN), ML_DATA_RATIO)

if FEATURES_LEVEL == 0:
    # Keep only tags that account for at least 0.2% of all tag occurrences
    stackoverflow_features = []
    tot_features = sum(tags_cnt.values())
    for k in tags_cnt:
        if 1. * tags_cnt[k] / tot_features >= 0.002:
            stackoverflow_features.append(k)
    print "StackOverflow features:", stackoverflow_features

mapping, X = vectorize(instances)

if FEATURES_LEVEL == 0:
    # Binary mask over X's columns marking the frequent StackOverflow tags
    stackoverflow_tags = np.zeros(len(mapping))
    for k in stackoverflow_features:
        stackoverflow_tags[mapping[k]] = 1.
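
# Note: this `vectorize` (unlike Example #1's) returns a feature-name ->
# column mapping plus a dense matrix; a minimal bag-of-features sketch,
# assuming each instance is an iterable of feature names:
def vectorize(instances):
    mapping = {}
    for inst in instances:
        for feat in inst:
            if feat not in mapping:
                mapping[feat] = len(mapping)
    X = np.zeros((len(instances), len(mapping)))
    for i, inst in enumerate(instances):
        for feat in inst:
            X[i, mapping[feat]] += 1.
    return mapping, X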

y = np.array(classes)
kf = KFold(len(classes), k=4)

if FEATURES_LEVEL > 0:
    classifiers = {
        'knn-15': KNeighborsClassifier(15, weights='distance'),
        'svc': SVC(C=1.0, coef0=0.0, degree=3, gamma=0.5, kernel='rbf',
                   probability=False, shrinking=True, tol=0.001),
        'tree': DecisionTreeClassifier(max_depth=10),
    }
print "Set built, %d(%d+%d) (ratio: %.2f)" % (len(classes),
                                              classes.count(Classes.INTERESTED),
                                              classes.count(Classes.UNKNOWN),
                                              ML_DATA_RATIO)

if FEATURES_LEVEL == 0:
    stackoverflow_features = []
    tot_features = sum([tags_cnt[k] for k in tags_cnt])
    for k in tags_cnt:
        if 1.*tags_cnt[k]/tot_features >= 0.002:
            stackoverflow_features.append(k)
    print "StackOverflow features:", stackoverflow_features


mapping, X = vectorize(instances)

if FEATURES_LEVEL == 0:
    stackoverflow_tags = np.zeros(len(mapping))
    for k in stackoverflow_features:
        stackoverflow_tags[mapping[k]] = 1.

y = np.array(classes)
kf = KFold(len(classes), k=4)

if FEATURES_LEVEL > 0:
    classifiers = {
        'knn-15':   KNeighborsClassifier(15, weights='distance'),
        'svc':  SVC(C=1.0, coef0=0.0, degree=3, gamma=0.5, kernel='rbf', probability=False,
                     shrinking=True, tol=0.001),
        'tree': DecisionTreeClassifier(max_depth=10),
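
# Note: the snippet breaks off here; a sketch of how the folds and the
# classifiers would typically be combined (assumes the FEATURES_LEVEL > 0
# branch above ran; the reporting format is assumed):
for name, clf in classifiers.items():
    scores = []
    for train_idx, test_idx in kf:
        clf.fit(X[train_idx], y[train_idx])
        scores.append(clf.score(X[test_idx], y[test_idx]))
    print "%s: mean accuracy %.3f" % (name, np.mean(scores))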
Example #4
# Build m sliding windows of length T from each market's series
for k in range(markets):
    for l in range(m):
        for i in range(T):
            dataset[l][k].append(float(data[k][l + i]))

###########################################################
### Set of correlation matrices as a set of vectors

SET = []
D = markets * (markets - 1) / 2  # number of distinct market pairs
for I in xrange(m):
    V = vectorize(correlation(dataset[I]))
    SET.append(V)
ASET = np.asarray(SET)
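
# Note: `correlation` and `vectorize` are not defined in this snippet. Under
# the usual reading -- a markets x markets correlation matrix per window,
# flattened to its D = markets*(markets-1)/2 upper-triangle entries -- a
# minimal sketch:
def correlation(matrix):
    # Pearson correlation between the rows (one row per market) of a window
    return np.corrcoef(np.asarray(matrix))

def vectorize(C):
    # Keep only the strict upper triangle, row by row, as a flat vector
    return C[np.triu_indices(C.shape[0], k=1)]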



###########################################################
###########################################################
### RESULTS (this section is PARALLELIZABLE!)
###########################################################
###########################################################
### Global parameters
threshold = 0.1
lim = 20
nn = 40
print "RESULTADOS..."