print 'train lex raw ', len(train_lex_raw)
print 'train y raw', len(train_y_raw)
print train_lex_raw[1]
print train_y_raw[1]

# Load the DrugBank test split from the processed CSV
test_toks_raw = []
test_lex_raw = []
test_y_raw = []
with open('./dataset/DDI13/DDI13_test_processed_drugbank.csv', 'rU') as f:
    rd = csv.DictReader(f)
    for row in rd:
        test_toks_raw.append(parselist(row['lower_tokens']))
        test_lex_raw.append([row['lower_text']])
        test_y_raw.append(parselist(row['label']))
print 'test lex raw ', len(test_lex_raw)

# Convert each sentence of normalized tokens and labels into arrays of indices
train_lex = vectorize(train_toks_raw, tok2idx)
train_y = vectorize(train_y_raw, labels2idx)
valid_lex = vectorize(valid_toks_raw, tok2idx)
valid_y = vectorize(valid_y_raw, labels2idx)
test_lex = vectorize(test_toks_raw, tok2idx)
test_y = vectorize(test_y_raw, labels2idx)
print 'test lex ', len(test_lex)

# # Pickle the resulting data set
# with open('./dataset/DDI13_processed_10fold_testOnDrugbank.pkl', 'w') as fout:
#     pkl.dump([[train_toks_raw, train_lex, train_y],
#               [valid_toks_raw, valid_lex, valid_y],
#               [test_toks_raw, test_lex, test_y],
#               {'labels2idx': labels2idx, 'words2idx': tok2idx}], fout)
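# `vectorize` is not defined in this snippet. A minimal sketch of what it is
# assumed to do here: map each sentence's tokens (or labels) to an array of
# integer indices via the given mapping, falling back to an out-of-vocabulary
# index. The '<UNK>' key and the numpy output are assumptions, not original code.
import numpy as np

def vectorize(sentences, mapping):
    unk = mapping.get('<UNK>', 0)  # assumed fallback index for unseen tokens
    return [np.array([mapping.get(tok, unk) for tok in sent], dtype='int32')
            for sent in sentences]

# Usage, e.g. with tok2idx = {'<UNK>': 0, 'aspirin': 1, 'warfarin': 2}:
# vectorize([['aspirin', 'inhibits', 'warfarin']], tok2idx)
# -> [array([1, 0, 2], dtype=int32)]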
instances, classes, tags_cnt = build_data_set(user_id, FEATURES_LEVEL == 0)
print "Set built, %d(%d+%d) (ratio: %.2f)" % (
    len(classes), classes.count(Classes.INTERESTED),
    classes.count(Classes.UNKNOWN), ML_DATA_RATIO)

if FEATURES_LEVEL == 0:
    # Keep only tags that account for at least 0.2% of all tag occurrences
    stackoverflow_features = []
    tot_features = sum(tags_cnt[k] for k in tags_cnt)
    for k in tags_cnt:
        if 1. * tags_cnt[k] / tot_features >= 0.002:
            stackoverflow_features.append(k)
    print "StackOverflow features:", stackoverflow_features

mapping, X = vectorize(instances)

if FEATURES_LEVEL == 0:
    # Flag the feature-matrix columns that correspond to StackOverflow tags
    stackoverflow_tags = np.zeros(len(mapping))
    for k in stackoverflow_features:
        stackoverflow_tags[mapping[k]] = 1.

y = np.array(classes)
kf = KFold(len(classes), k=4)

if FEATURES_LEVEL > 0:
    classifiers = {
        'knn-15': KNeighborsClassifier(15, weights='distance'),
        'svc': SVC(C=1.0, coef0=0.0, degree=3, gamma=0.5, kernel='rbf',
                   probability=False, shrinking=True, tol=0.001),
        'tree': DecisionTreeClassifier(max_depth=10),
    }  # snippet truncated here; the original may define further classifiers
for k in range(markets):
    for l in range(m):
        for i in range(T):
            dataset[l][k].append(float(data[k][l + i]))

###########################################################
### Set of correlation matrices as a SET of vectors
SET = []
D = markets * (markets - 1) / 2  # entries in the strict upper triangle
for I in xrange(m):
    matrix = dataset[I]
    V = vectorize(correlation(matrix))
    SET.append(V)
ASET = np.asarray(SET)

###########################################################
###########################################################
### RESULTS (parallelizable section!)
###########################################################
###########################################################
### Global parameters
threshold = 0.1
lim = 20
nn = 40
print "RESULTS..."
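# `correlation` and `vectorize` are not shown in this snippet. A minimal
# sketch consistent with D = markets*(markets-1)/2: `correlation` builds the
# markets x markets correlation matrix of one window (rows = markets), and
# `vectorize` flattens its strict upper triangle into a length-D vector.
# Both bodies are assumptions about the original helpers.
import numpy as np

def correlation(matrix):
    return np.corrcoef(np.asarray(matrix))  # rows are per-market series

def vectorize(C):
    iu = np.triu_indices(C.shape[0], k=1)  # strict upper triangle: D entries
    return C[iu]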