def run():
    # If prepared file does not exist yet, create it
    if not os.path.isfile(FREESURFER_FILE_PREP):
        # Load FreeSurfer features
        features = util.load_features(FREESURFER_FILE)
        # Perform age-matching
        features = prepare.match_ages(features, 'HC', 'SZ', age_diff=2)
        # Remove constant features
        features = prepare.remove_constant_features(features)
        # Normalize numerical features across subjects (excluding 'age')
        features = prepare.normalize_across_subjects(features, exclude=['age'])
        # Residualize features for age, gender and total intracranial volume
        features = prepare.residualize(features, ['age', 'gender', 'EstimatedTotalIntraCranialVol'])
        # Remove highly correlated features
        features = prepare.remove_correlated_features(features)
        # Remove certain columns
        features = prepare.remove_features(features, ['diagnosis', 'age', 'gender'])
        # Write prepared FreeSurfer features back to file
        util.save_features(FREESURFER_FILE_PREP, features)
    else:
        # Load prepared features
        features = util.load_features(FREESURFER_FILE_PREP)

    # Run DBSCAN on features
    dbscan = DBSCAN(epsilon=1.0, min_pts=5)
    dbscan.run(features)
def get_all_combinations_of_variables(category):
    # List of all the variables involved
    # random_variables = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
    random_variables = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

    # Dictionaries to store the variable pairs and their values
    dict_variables = {}
    dict_values = {}

    features = load_features(category)

    # # Writing to disk
    # if category == "cursive":
    #     pickle.dump(features, open("../total_features_cursive.p", "wb"))
    # elif category == "printed":
    #     pickle.dump(features, open("../total_features_printed.p", "wb"))
    #
    # print len(features)

    # Generate all combinations of the variables and keep the pairs (length 2)
    for random_var in range(0, len(random_variables) + 1):
        for variable_pair in itertools.combinations(random_variables, random_var):
            if len(variable_pair) == 2:
                dict_values = get_dict_values(features, variable_pair, category)
                dict_variables[variable_pair] = dict_values
                # print(variable_pair)

    return dict_variables
def translate_vcf(vcf_fname, reference, path, feature_names=None):
    import time
    start = time.time()
    try:
        vcf_dict = read_in_vcf(vcf_fname, ref_fasta(path))
    except:
        print "Loading input alignment failed!: {}".format(vcf_fname)
    end = time.time()
    print "Reading in VCF took {}".format(str(end-start))

    start = time.time()
    featN = np.array(feature_names)
    selected_features = load_features(reference, feature_names)
    print "Translating {} genes...".format(len(selected_features))
    end = time.time()
    print "Reading in genes took {}".format(str(end-start))

    ref = vcf_dict['reference']
    sequences = vcf_dict['sequences']
    prots = {}
    deleted = []
    notMult3 = []

    start = time.time()
    # If genes have no mutations across sequences, they are dropped here from further analysis.
    # Also check that gene lengths are multiples of 3. The first occurrence causes an error in
    # Biopython, but subsequent ones do not - so raise the warning ourselves.
    for fname, feature in selected_features.items():
        if len(str(feature.extract(SeqRecord(seq=Seq(ref))).seq)) % 3 != 0:
            notMult3.append(fname)
        prot_dict = translate_vcf_feature(sequences, ref, feature)
        if prot_dict is not None:
            prots[fname] = prot_dict
        else:
            deleted.append(fname)
    end = time.time()
    print "Translations took {}".format(str(end-start))

    start = time.time()
    # Print out VCF of proteins.
    # In new augur, set compress depending on input file ending!
    write_VCF_translation(prots, translation_vcf_file(path), translation_ref_file(path), compress=False)
    end = time.time()
    print "Writing out VCF took {}".format(str(end-start))

    # Warn of genes whose length is not a multiple of 3
    print "WARNING: These genes do not have lengths that are a multiple of 3!\n{}".format(str(notMult3))

    # Print dropped genes to a text file
    if len(deleted) != 0:
        with open(dropped_genes(path), 'w') as the_file:
            for d in deleted:
                the_file.write(d + "\n")
        print "{} genes had no mutations and so will be excluded. Excluded genes can be found in {}".format(len(deleted), dropped_genes(path))

    return prots
def run_model(train_data, test_data, n_trees, submit_id, model, save_model=False):
    '''
    Trains a model of the specified type and size on the training data,
    then predicts on the test data and writes out a submission.

    Args:
        train_data - bare training feature set name without path or extension
        test_data - bare test feature set name without path or extension
        n_trees - number of trees to use in the model
        submit_id - the result is written as submissions/submission_<submit_id>.csv
        model - a string...either 'rf' or 'extra'
        save_model - default False. If True, use joblib to dump the model at:
                     paths.MODELS/<submit_id>_model.job

    Writes:
        A submission at paths.SUBMIT/submission_<submit_id>.csv
    '''
    start = datetime.now()
    train = util.load_features(train_data)
    drops = util.get_drop_cols(train)
    train.drop(drops, axis=1, inplace=True)
    print 'training set size: (%d, %d)' % train.shape

    print 'Training...'
    if model == 'rf':
        model = train_rf(train, n_trees)
    else:
        model = train_extra_trees(train, n_trees)

    if save_model:
        model_path = os.path.join(paths.MODELS, submit_id + '_model.job')
        joblib.dump(model, model_path)

    del train
    print 'Predicting...'
    test = util.load_features(test_data)
    test.drop(drops, axis=1, inplace=True)
    print 'test set size: (%d, %d)' % test.shape

    result = predict(model, test)
    submission_name = 'submission_%s.csv' % str(submit_id)
    submission = os.path.join(paths.SUBMIT, submission_name)
    result.to_csv(submission, index=False)

    finish = datetime.now()
    print 'Run finished: %d sec.' % (finish - start).seconds
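# Hypothetical example call (not part of the original module): the feature-set names
# and submit id below are illustrative only and assume matching files exist under the
# configured paths, as described in the docstring above.
# run_model('train_v1', 'test_v1', n_trees=500, submit_id='rf500', model='rf', save_model=True)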
def main():
    dataset = None
    if len(sys.argv) > 1:
        dataset = sys.argv[1]
    metadata = util.get_metadata((dataset + "_metadata") if dataset else None)
    mfcc = dict(zip([metadata[i][0] for i in range(1, len(metadata))],
                    util.load_features((dataset + "_features") if dataset else None)))

    # Load pyAudioAnalysis features
    with open("F", "rb") as f:
        feats, files = pickle.load(f, encoding="latin1")
    files = [f.split(".")[0].split("XC")[-1] for f in files]
    F = dict(zip(files, feats))

    full_dataset = True
    for item in metadata[1:]:
        if item[0] not in F:
            full_dataset = False

    X2, X3 = [], []
    if full_dataset:
        X3 = [np.concatenate((F[item[0]], mfcc[item[0]]), axis=0) for item in metadata[1:]]
        X2 = [F[item[0]] for item in metadata[1:]]
    X1 = [mfcc[item[0]] for item in metadata[1:]]

    for X in [X1, X2]:
        NUM_RUNS = 50
        Y = util.load_labels((dataset + "_metadata") if dataset else None)
        samples = range(len(X))  # range(1, len(X), 12)  # random.sample(range(len(X)), 25)
        samps = samples  # range(len(X))
        x = [X[i] for i in samps]
        y = [Y[i] for i in samples]
        N_ESTIMATORS = 20
        avg_mat = None
        for run in range(NUM_RUNS):
            clf = RandomForestClassifier(n_estimators=N_ESTIMATORS, max_features=20, oob_score=True).fit(X, Y)
            similarity = dict()
            for dt in clf.estimators_:
                leaves = dt.apply(X)
                for i in samps:
                    for j in samps:
                        if leaves[i] == leaves[j]:
                            similarity[(i, j)] = similarity.get((i, j), 0) + 1
            mat = np.array([[(1.0 - similarity.get((i, j), 0) / N_ESTIMATORS)**2 for j in samples] for i in samples])
            mat = squareform(mat)
            if avg_mat is None:
                avg_mat = mat
            else:
                avg_mat = np.add(avg_mat, mat)
        avg_mat = avg_mat / NUM_RUNS

        linkage_matrix = linkage(avg_mat, "single")
        matplotlib.rcParams['lines.linewidth'] = 2.5
        dendrogram(linkage_matrix, color_threshold=0.8, labels=y, show_leaf_counts=True)
        plt.xlabel("label")
        plt.ylabel("distance")
        plt.show()
def run():
    for center in CONFIG.keys():
        # Load meta data and give it a new index based on measurement and subject ID
        meta_data_file_path = CONFIG[center]['meta']
        meta_data = pd.read_excel(meta_data_file_path, header=3)
        index = []
        for i in range(len(meta_data.index)):
            mid = meta_data.iloc[i][meta_data.columns[0]]
            sid = meta_data.iloc[i][meta_data.columns[1]]
            index.append('{}_{}_sMRI'.format(sid, mid))
        meta_data['id'] = pd.Series(index)
        meta_data.set_index('id', drop=True, inplace=True)

        # Load feature data
        features_file_path = CONFIG[center]['features']
        features = util.load_features(features_file_path, index_col='MRid')

        try:
            # Select rows in meta data corresponding to subject IDs in the feature data.
            # Currently, there seems to be something wrong with the CIMH data, that
            # is, there's no overlap in subject IDs at all...
            # TODO: Wait for Emanuel to explain
            meta_data = meta_data.loc[features.index]
        except KeyError as e:
            print('Subject IDs in feature data do not match meta data: {}'.format(e))
            continue

        meta_data = meta_data[meta_data['Gender [m/f]'].notnull()]

        # Convert gender values to a standardized format
        for idx in meta_data.index:
            gender = meta_data.loc[idx]['Gender [m/f]']
            meta_data.set_value(idx, 'Gender [m/f]', to_gender(gender))

        # Add columns to original feature data
        features['Center'] = center
        features['Age'] = meta_data['Age [years]']
        features['Gender'] = meta_data['Gender [m/f]']
        features['Diagnosis'] = meta_data['Diagnosis']
        CONFIG[center]['features_ext'] = features

    # Concatenate feature data sets
    features = pd.concat([
        CONFIG['CIMH']['features_ext'],
        CONFIG['UIO']['features_ext'],
        CONFIG['UNIBA']['features_ext'],
        CONFIG['UNICH']['features_ext'],
    ])

    # Save concatenated feature data back to CSV file
    util.save_features(OUTPUT_FILE, features, index_label='MRid')
def translate(aln_fname, reference, feature_names, name_func):
    try:
        aln = AlignIO.read(aln_fname, 'fasta')
    except:
        print("Loading input alignment failed!:", aln_fname)

    selected_features = load_features(reference, feature_names)
    for fname, feature in selected_features.items():
        translation = translate_feature(aln, feature)
        AlignIO.write(translation, name_func(fname), 'fasta')
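# Hypothetical standalone invocation (paths and gene names are illustrative, not from
# the original project): translate two genes from a nucleotide alignment and write one
# amino-acid FASTA per gene, named by the supplied name_func.
# translate('aligned_nuc.fasta', 'reference.gb', ['gag', 'pol'],
#           lambda gene: 'translation_{}.fasta'.format(gene))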
def export_diversity(path, prefix, reference, indent=None):
    '''
    write the alignment entropy of each alignment (nucleotide and translations) to file
    '''
    genes = load_features(reference)
    entropy_json = {}
    for feat, aln_fname in get_genes_and_alignments(path, tree=False):
        entropy = diversity_statistics(aln_fname, nuc=feat == 'nuc')
        S = [max(0, round(x, 4)) for x in entropy]
        n = len(S)
        if feat == 'nuc':
            entropy_json[feat] = {
                'pos': range(0, n),
                'codon': [x // 3 for x in range(0, n)],
                'val': S
            }
        elif feat in genes:
            entropy_json[feat] = {
                'pos': [x for x in genes[feat]][::3],
                'codon': range(n),
                'val': S
            }
    write_json(entropy_json, diversity_json(path, prefix), indent=indent)
def main():
    dataset = None
    if len(sys.argv) > 1:
        dataset = sys.argv[1]
    metadata = util.get_metadata((dataset + "_metadata") if dataset else None)
    mfcc = dict(zip([metadata[i][0] for i in range(1, len(metadata))],
                    util.load_features((dataset + "_features") if dataset else None)))

    feats, files = None, None
    with open("F", "rb") as f:
        feats, files = pickle.load(f, encoding="latin1")
    files = [f.split(".")[0].split("XC")[-1] for f in files]
    F = dict(zip(files, feats))

    full_dataset = True
    for item in metadata[1:]:
        if item[0] not in F:
            full_dataset = False

    X2, X3 = [], []
    if full_dataset:
        X3 = [np.concatenate((F[item[0]], mfcc[item[0]]), axis=0) for item in metadata[1:]]
        X2 = [F[item[0]] for item in metadata[1:]]
    X1 = [mfcc[item[0]] for item in metadata[1:]]

    Y = util.load_labels((dataset + "_metadata") if dataset else None)  # "bbsmd.csv"
    for X in [X1, X2] if full_dataset else [X1]:
        print("------")
        classifiers = [
            RandomForestClassifier(n_estimators=50, max_features=15, oob_score=True),
            KNeighborsClassifier(3),
            svm.SVC(kernel='linear', C=1),
            svm.SVC(gamma=2, C=1),
            GaussianNB()
        ]
        for clf in classifiers:
            scores = cross_val_score(clf, X, Y, cv=5)
            score = sum(scores) / len(scores)
            print(type(clf).__name__, "\t", score)
except: print("Loading input alignment failed!:", aln_fname) selected_features = load_features(reference, feature_names) for fname, feature in selected_features.items(): translation = translate_feature(aln, feature) AlignIO.write(translation, name_func(fname), 'fasta') if __name__ == '__main__': parser = generic_argparse("Translate the nucleotide alignments") parser.add_argument('--reference', required=True, help='genbank file containing the annotation') parser.add_argument('--genes', nargs='+', help="genes to translate") args = parser.parse_args() path = args.path if not args.genes: genes = load_features(args.reference).keys() else: genes = args.genes for func in [tree_sequence_alignment, ref_alignment]: aln_fname = func(path, 'nuc') if os.path.isfile(aln_fname): translate(aln_fname, args.reference, genes, lambda x: func(path, x))
def export_metadata_json(T, path, prefix, reference, isvcf=False, indent=1):
    print("Writing out metaprocess")
    mjson = {}
    mjson["virus_count"] = T.count_terminals()

    from datetime import date
    mjson["updated"] = date.today().strftime('%Y-%m-%d')

    mjson["author_info"] = {
        "?": {
            "paper_url": "?",
            "journal": "?",
            "title": "?",
            "n": 1
        }
    }
    mjson["seq_author_map"] = {}

    from collections import defaultdict
    cmaps = defaultdict(list)
    with open(color_maps(path), 'r') as cfile:
        for line in cfile:
            try:
                trait, name, color = line.strip().split('\t')
            except:
                continue
            cmaps[trait].append((name, color))

    # If drug-resistance colours have been auto-generated, get these too
    import os.path
    if os.path.isfile(drm_color_maps(path)):
        with open(drm_color_maps(path), 'r') as cfile:
            for line in cfile:
                try:
                    trait, name, color = line.strip().split('\t')
                except:
                    continue
                cmaps[trait].append((name, color))

    mjson["color_options"] = {
        "gt": {
            "menuItem": "genotype",
            "type": "discrete",
            "legendTitle": "Genotype",
            "key": "genotype"
        },
        "num_date": {
            "menuItem": "date",
            "type": "continuous",
            "legendTitle": "Sampling date",
            "key": "num_date"
        }
    }
    for trait in cmaps:
        mjson["color_options"][trait] = {
            "menuItem": trait,
            "type": "discrete",
            "color_map": cmaps[trait],
            "legendTitle": trait,
            "key": trait
        }

    mjson["panels"] = ["tree", "map", "entropy"]
    mjson["title"] = "NextTB"
    mjson["maintainer"] = "Emma Hodcroft"

    mjson["geo"] = {}
    lat_long_defs = load_lat_long_defs()
    for geo_trait in ['region', 'country', 'division']:
        mjson["geo"][geo_trait] = {}
        for n in T.find_clades():
            if geo_trait in n.attr:
                place = n.attr[geo_trait]
                if (place not in mjson["geo"][geo_trait]
                        and place in lat_long_defs):
                    mjson["geo"][geo_trait][place] = lat_long_defs[place]

    mjson["commit"] = "unknown"
    mjson["filters"] = ["country", "region", "division"]

    genes = load_features(reference)
    anno = {}
    for feat, aln_fname in get_genes_and_alignments(path, tree=False):
        if feat in genes:
            anno[feat] = {
                "start": int(genes[feat].location.start),
                "end": int(genes[feat].location.end),
                "strand": genes[feat].location.strand
            }
    if isvcf:
        # If VCF, there is no 'gene' called 'nuc' that will be read in
        # above, so manually include it here.
        from filenames import ref_fasta
        from Bio import SeqIO
        refSeq = SeqIO.parse(ref_fasta(path), format='fasta').next()
        anno['nuc'] = {"start": 1, "end": len(refSeq.seq), "strand": 1}
    mjson["annotations"] = anno

    write_json(mjson, meta_json(path, prefix), indent=indent)
def generate_matrix_with_label(raw_item_file_path, uipairs_features_file_path,
                               users_features_file_path, items_features_file_path,
                               categorys_features_file_path, ucpairs_features_file_path,
                               label_file_path, begin_date, end_date):
    print "\n" + begin_date + "---" + end_date + " generating matrix with label..."
    users_column_index, users_features = load_features(users_features_file_path)
    items_column_index, items_features = load_features(items_features_file_path)
    categorys_column_index, categorys_features = load_features(categorys_features_file_path)
    uc_column_index, uc_features = load_uc_features(ucpairs_features_file_path)

    uipairs_features_file = open(uipairs_features_file_path)
    matrix_file_path = "./feature/" + begin_date + "-" + end_date + "-matrix-label.csv"
    matrix_file = open(matrix_file_path, 'w')

    # Load the item -> category dictionary
    item_category_dict = {}
    raw_item_file = open(raw_item_file_path)
    for line in raw_item_file:
        line_entrys = line.strip().split(delimiter)
        item_id = line_entrys[0]
        category_id = line_entrys[2]
        item_category_dict[item_id] = category_id
    raw_item_file.close()

    label_set = set()
    label_file = open(label_file_path)
    for line in label_file:
        line_entrys = line.split(delimiter)
        ui_id = delimiter.join(line_entrys[0:2])
        if line_entrys[2] == '4':
            label_set.add(ui_id)
    label_file.close()

    # Read the column names
    users_features_file = open(users_features_file_path)
    items_features_file = open(items_features_file_path)
    categorys_features_file = open(categorys_features_file_path)
    ucpairs_features_file = open(ucpairs_features_file_path)
    ui_column_name = uipairs_features_file.readline().split(delimiter)[:-1] + \
        users_features_file.readline().split(delimiter)[1:-1] + \
        items_features_file.readline().split(delimiter)[1:-1] + \
        categorys_features_file.readline().split(delimiter)[1:-1] + \
        ucpairs_features_file.readline().split(delimiter)[2:-1]
    # matrix_file.write(delimiter.join(ui_column_name) + ",label\n")
    users_features_file.close()
    items_features_file.close()
    categorys_features_file.close()
    ucpairs_features_file.close()

    for line in uipairs_features_file:
        line_entrys = line.split(delimiter)
        user_id = line_entrys[0]
        item_id = line_entrys[1]
        # matrix_line = delimiter.join(line_entrys[:-1]) + delimiter + \
        #     delimiter.join(users_features[user_id]) + delimiter + \
        #     delimiter.join(items_features[item_id]) + \
        #     delimiter.join(categorys_features[item_id]) + \
        #     delimiter.join(uc_features[ui_id]) + "\n"
        matrix_line = line_entrys[:-1] + \
            users_features[user_id] + \
            items_features[item_id] + \
            categorys_features[item_category_dict[item_id]] + \
            uc_features[item_category_dict[item_id]]
        label = "0"
        if (user_id + "," + item_id) in label_set:
            label = "1"
        matrix_file.write(delimiter.join(matrix_line) + "," + label + "\n")

    matrix_file.close()
    uipairs_features_file.close()
    print "generate matrix with label completed\n"
    return matrix_file_path


# path = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) + '\\source'
# os.chdir(path)  # change dir to '~/files'
#
# uipairs_features_file_path = "./feature/2014-12-9-2014-12-18-uifeat.csv"
# users_features_file_path = "./feature/2014-12-9-2014-12-18-userfeat.csv"
# items_features_file_path = "./feature/2014-12-9-2014-12-18-itemfeat.csv"
#
# generate_matrix(uipairs_features_file_path, users_features_file_path, items_features_file_path, "2014-12-9", "2014-12-18")
def generate_matrix(raw_item_file_path, uipairs_features_file_path,
                    users_features_file_path, items_features_file_path,
                    categorys_features_file_path, ucpairs_features_file_path,
                    begin_date, end_date):
    print "\n" + begin_date + "---" + end_date + " generating matrix..."
    users_column_index, users_features = load_features(users_features_file_path)
    items_column_index, items_features = load_features(items_features_file_path)
    categorys_column_index, categorys_features = load_features(categorys_features_file_path)
    uc_column_index, uc_features = load_uc_features(ucpairs_features_file_path)

    uipairs_features_file = open(uipairs_features_file_path)
    matrix_file_path = "./feature/" + begin_date + "-" + end_date + "-matrix.csv"
    matrix_file = open(matrix_file_path, 'w')

    # Load the item -> category dictionary
    item_category_dict = {}
    raw_item_file = open(raw_item_file_path)
    for line in raw_item_file:
        line_entrys = line.strip().split(delimiter)
        item_id = line_entrys[0]
        category_id = line_entrys[2]
        item_category_dict[item_id] = category_id
    raw_item_file.close()

    # Read the column names
    users_features_file = open(users_features_file_path)
    items_features_file = open(items_features_file_path)
    categorys_features_file = open(categorys_features_file_path)
    ucpairs_features_file = open(ucpairs_features_file_path)
    ui_column_name = uipairs_features_file.readline().split(delimiter)[:-1] + \
        users_features_file.readline().split(delimiter)[1:-1] + \
        items_features_file.readline().split(delimiter)[1:-1] + \
        categorys_features_file.readline().split(delimiter)[1:-1] + \
        ucpairs_features_file.readline().split(delimiter)[2:-1]
    matrix_file.write(delimiter.join(ui_column_name) + "\n")
    users_features_file.close()
    items_features_file.close()
    categorys_features_file.close()
    ucpairs_features_file.close()

    for line in uipairs_features_file:
        line_entrys = line.split(delimiter)
        user_id = line_entrys[0]
        item_id = line_entrys[1]
        # matrix_line = delimiter.join(line_entrys[:-1]) + delimiter + \
        #     delimiter.join(users_features[user_id]) + delimiter + \
        #     delimiter.join(items_features[item_id]) + \
        #     delimiter.join(categorys_features[item_id]) + \
        #     delimiter.join(uc_features[ui_id]) + "\n"
        matrix_line = line_entrys[:-1] + \
            users_features[user_id] + \
            items_features[item_id] + \
            categorys_features[item_category_dict[item_id]] + \
            uc_features[item_category_dict[item_id]]
        matrix_file.write(delimiter.join(matrix_line) + "\n")

    matrix_file.close()
    uipairs_features_file.close()
    print "generate matrix completed\n"
    return matrix_file_path
def load():
    return util.load_features(FILE_NAME, index_col='MRid')
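# A minimal sketch of what a CSV-backed util.load_features could look like; this is
# an assumption for illustration only, not the project's actual implementation.
import pandas as pd

def load_features_sketch(file_name, index_col=None):
    # Read per-subject features from a CSV file, optionally indexed by an ID
    # column such as 'MRid', and return them as a pandas DataFrame.
    return pd.read_csv(file_name, index_col=index_col)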
def main():
    dataset = None
    if len(sys.argv) > 1:
        dataset = sys.argv[1]
    metadata = util.get_metadata((dataset + "_metadata") if dataset else None)
    mfcc = dict(zip([metadata[i][0] for i in range(1, len(metadata))],
                    util.load_features((dataset + "_features") if dataset else None)))

    # Load pyAudioAnalysis features
    with open("F", "rb") as f:
        feats, files = pickle.load(f, encoding="latin1")
    files = [f.split(".")[0].split("XC")[-1] for f in files]
    F = dict(zip(files, feats))

    full_dataset = True
    for item in metadata[1:]:
        if item[0] not in F:
            full_dataset = False

    X2, X3 = [], []
    if full_dataset:
        X3 = [np.concatenate((F[item[0]], mfcc[item[0]]), axis=0) for item in metadata[1:]]
        X2 = [F[item[0]] for item in metadata[1:]]
    X1 = [mfcc[item[0]] for item in metadata[1:]]

    # X = util.load_features((dataset + "_features") if dataset else None)
    for X in [X1, X2]:
        labels = []
        avg_mat = None
        all_sims = dict()
        Y = util.load_labels((dataset + "_metadata") if dataset else None)
        samples = range(len(X))  # range(1, len(X), 12)  # random.sample(range(len(X)), 25)
        samps = range(len(X))  # samples
        x = [X[i] for i in samps]
        y = [Y[i] for i in samples]
        N_ESTIMATORS = 80
        NUM_RUNS = 5
        for run in range(NUM_RUNS):
            clf = RandomForestClassifier(n_estimators=N_ESTIMATORS, max_features=25, oob_score=True).fit(X, Y)
            similarity = dict()
            for dt in clf.estimators_:
                leaves = dt.apply(X)
                for i in samps:
                    for j in samps:
                        if leaves[i] == leaves[j]:
                            similarity[(i, j)] = similarity.get((i, j), 0) + (1 / N_ESTIMATORS)

            species_similarity = dict()
            for i in samps:
                for j in samps:
                    species_similarity[(Y[i], Y[j])] = species_similarity.get((Y[i], Y[j]), 0) + \
                        similarity.get((i, j), 0)**2 / (Y.count(Y[i]) * Y.count(Y[j]))
            for k in species_similarity:
                species_similarity[k] = species_similarity[k]**(0.5)

            labels = clf.classes_
            for i in range(len(labels)):
                normal = species_similarity[(labels[i], labels[i])]
                for j in range(i, len(labels)):
                    k = labels[i], labels[j]
                    species_similarity[k] /= normal
                    species_similarity[(k[1], k[0])] = species_similarity[k]
                    all_sims[k] = all_sims.get(k, 0) + species_similarity[k] / NUM_RUNS

            mat = np.array([[(1.0 - species_similarity.get((i, j), 0))**2 for j in labels] for i in labels])
            print(mat)
            mat = squareform(mat)
            if avg_mat is None:
                avg_mat = mat
            else:
                avg_mat = np.add(avg_mat, mat)

        avg_mat = avg_mat / NUM_RUNS
        print(avg_mat)
        for k in all_sims:
            if k[0] != k[1] and all_sims[k] > 0.1:
                print("{}\t{}\t{}".format(k[0], k[1], all_sims[k]))

        linkage_matrix = linkage(avg_mat, "single")
        matplotlib.rcParams['lines.linewidth'] = 2.5
        dendrogram(linkage_matrix, color_threshold=0.65, labels=labels, show_leaf_counts=True)
        plt.xlabel("label")
        plt.ylabel("distance")
        plt.show()
# #train_labels,_ = util.read_h5(os.path.join(prefix,output_dir,"train_selected_cnn_{}_label.h5".format(feature_name)))
# train_features,train_labels = util.load_features(prefix,output_dir,feature_name,fold,"train")
# test_features,test_labels = util.load_features(prefix,test_output_dir,feature_name,fold,"test")
# #compute_features(train_features,train_labels,"fisher_{}.npy".format(fold),method="fisher")
# load_feature_sets("fisher",feature_num=1000,padded=True,save=False)

# Number of features vs acc/F1
# for i in [10, 20, 50, 100, 1000]:
for i in [1000]:
    accs, mf1s, wf1s = [], [], []
    for j in [1, 2, 3]:
        fold = j
        feature_name = "embo{}_norm".format(fold)
        # model_name = "train_combined{}_multiple_norm".format(fold)
        train_features, train_labels = util.load_features(prefix, output_dir, feature_name, fold, "train")
        test_features, test_labels = util.load_features(prefix, test_output_dir, feature_name, fold, "test")
        acc, mf1, wf1, conf = load_feature_sets("fisher", i, save=False, padded=False)
        if j == 1:
            confs = conf
        else:
            confs += conf
        accs.append(acc)
        mf1s.append(mf1)
        wf1s.append(wf1)
    print("mean acc", np.mean(accs), "std acc", np.std(accs))
    print("mean weighted F1 scores", np.mean(wf1s), "std weighted F1 scores", np.std(wf1s))
    print("mean macro F1 scores", np.mean(mf1s), "std macro F1 scores", np.std(mf1s))
def PCA_analysis(feature_num=5, fold=1, method="mrmr", cluster_method="one-vs-one"):
    all_idx = []
    feature_name = "combined{}_multiple_norm".format(fold)
    if cluster_method == "one-vs-one":
        for count, combo in enumerate(combinations(labels_dict.items(), 2)):
            score, idx = load_features("{}_{}_{}_{}.npy".format(method, combo[0][0], combo[1][0], fold))
            all_idx.extend(idx[:feature_num])
            # all_idx.extend(get_most_important_features(os.path.join(method,"{}_feature_importance_{}_{}.txt".format(method,combo[0][0],combo[1][0])),idx=True)[:feature_num])
    elif cluster_method == "one-vs-all":
        for key in labels_dict.keys():
            score, idx = load_features("{}_{}_{}.npy".format(method, key, fold))
            all_idx.extend(idx[:feature_num])
            # all_idx.extend(get_most_important_features(os.path.join(method,"{}_feature_importance_{}.txt".format(method,key)),idx=True)[:feature_num])
    elif cluster_method == "overlap":
        for key in labels_dict.keys():
            feature_dict = find_over_lap_features(method, key, True)
            for k, v in feature_dict.items():
                all_idx.extend(v[:feature_num])

    all_idx = list(set(all_idx))
    score, all_idx_overlap = load_features("overlap_at_least_3.npy")
    intersection_idx = list(set(all_idx_overlap).intersection(set(all_idx)))
    for i in intersection_idx:
        print(feature_names[i])

    train_features, train_labels = util.load_features(prefix, output_dir, feature_name, fold, "train")
    train_features_selected = train_features[:, all_idx]
    test_features, test_labels = util.load_features(prefix, output_dir, feature_name, fold, "test")
    test_features_selected = test_features[:, all_idx]
    print("original", test_features_selected.shape)

    lda = LDA(n_components=None, priors=None, shrinkage=None, solver='svd',
              store_covariance=False, tol=0.0001)
    test_features_selected = lda.fit_transform(test_features_selected, test_labels)
    print("lda", test_features_selected.shape)
    # test_features_selected = PCA(n_components=2).fit_transform(test_features_selected)
    tf = TSNE(n_components=2, perplexity=30).fit_transform(test_features_selected)

    colors = ['r', 'darkgreen', 'y', 'c', 'b']
    curr_colors = np.asarray([colors[int(i)] for i in test_labels])
    selected_idx = get_num_samples(test_labels)
    selected_colors = curr_colors[selected_idx]
    plt.scatter(tf[selected_idx, 0], tf[selected_idx, 1], c=selected_colors, alpha=0.7)
    # plt.scatter(tf[:,0],tf[:,1],c=curr_colors,alpha=0.7)
    legend_elements = [
        Line2D([0], [0], marker='o', color='w', label='CRY', markerfacecolor='r', markersize=10),
        Line2D([0], [0], marker='o', color='w', label='FUS', markerfacecolor='darkgreen', markersize=10),
        Line2D([0], [0], marker='o', color='w', label='LAU', markerfacecolor='y', markersize=10),
        Line2D([0], [0], marker='o', color='w', label='BAB', markerfacecolor='c', markersize=10),
        Line2D([0], [0], marker='o', color='w', label='SCR', markerfacecolor='b', markersize=10)
    ]
    plt.legend(handles=legend_elements, loc="upper right")
    # plt.show()
    # plt.savefig("overlap_at_least_3_reduced_bab_600_other_120.png")
    plt.savefig("{}/{}_top_{}_reduced_bab_600_other_120.png".format(method, cluster_method, feature_num))
    plt.close()