Example #1
def run():
	# If prepared file does not exist yet, create it
	if not os.path.isfile(FREESURFER_FILE_PREP):
		# Load FreeSurfer features
		features = util.load_features(FREESURFER_FILE)
		# Perform age-matching
		features = prepare.match_ages(features, 'HC', 'SZ', age_diff=2)
		# Remove constant features
		features = prepare.remove_constant_features(features)
		# Normalize numerical features across subjects (excluding 'age')
		features = prepare.normalize_across_subjects(features, exclude=['age'])
		# Residualize features for age, gender and total intracranial volume
		features = prepare.residualize(features, ['age', 'gender', 'EstimatedTotalIntraCranialVol'])
		# Remove highly correlated features
		features = prepare.remove_correlated_features(features)
		# Remove certain columns
		features = prepare.remove_features(features, ['diagnosis', 'age', 'gender'])
		# Write prepared freesurfer features back to file
		util.save_features(FREESURFER_FILE_PREP, features)
	else:
		# Load prepared features
		features = util.load_features(FREESURFER_FILE_PREP)

	# Run DBSCAN on features
	dbscan = DBSCAN(epsilon=1.0, min_pts=5)
	dbscan.run(features)
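
Nearly every snippet in this collection assumes a small project-specific helper module (here `util`, plus `prepare`). As a minimal sketch of what `load_features` / `save_features` could look like, assuming CSV feature tables indexed by subject ID (the real helpers differ per project):

import pandas as pd

def load_features(path, index_col=0):
    # Read a feature table: rows are subjects/samples, columns are features.
    return pd.read_csv(path, index_col=index_col)

def save_features(path, features, index_label=None):
    # Write the feature table back to disk.
    features.to_csv(path, index_label=index_label)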
Example #2
def get_all_combinations_of_variables(category):
    # List for all the variables involved
#     random_variables = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
    random_variables = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    # Dictionary to store the variable values
    dict_variables = {}
    dict_values = {}
    
    features = load_features(category)
    
#     #Writing to disk
#     if category == "cursive":    
#         pickle.dump(features, open("../total_features_cursive.p", "wb"))
#     elif category == "printed":
#         pickle.dump(features, open("../total_features_printed.p", "wb"))    
#     
#     print len(features)
    # Generating all combinations of length 2 
    for random_var in range(0, len(random_variables) + 1):
        for variable_pair in itertools.combinations(random_variables, random_var):
            if len(variable_pair) == 2:
                dict_values = get_dict_values(features, variable_pair, category)
                dict_variables[variable_pair] = dict_values
                # print(variable_pair)
    
    return dict_variables
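
The nested loop above enumerates every subset size and keeps only the pairs. A more direct equivalent (a sketch reusing this snippet's `load_features` and `get_dict_values`) asks itertools for the length-2 combinations up front:

import itertools

def get_all_pairs_of_variables(category):
    random_variables = list(range(1, 13))
    features = load_features(category)
    # Same result as above: one entry per unordered pair of variables.
    return {pair: get_dict_values(features, pair, category)
            for pair in itertools.combinations(random_variables, 2)}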
Example #3
def get_all_combinations_of_variables(category):
    # List for all the variables involved
    #     random_variables = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
    random_variables = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    # Dictionary to store the variable values
    dict_variables = {}
    dict_values = {}

    features = load_features(category)

    #     #Writing to disk
    #     if category == "cursive":
    #         pickle.dump(features, open("../total_features_cursive.p", "wb"))
    #     elif category == "printed":
    #         pickle.dump(features, open("../total_features_printed.p", "wb"))
    #
    #     print len(features)
    # Generating all combinations of length 2
    for random_var in range(0, len(random_variables) + 1):
        for variable_pair in itertools.combinations(random_variables,
                                                    random_var):
            if len(variable_pair) == 2:
                dict_values = get_dict_values(features, variable_pair,
                                              category)
                dict_variables[variable_pair] = dict_values
                # print(variable_pair)

    return dict_variables
Example #4
def translate_vcf(vcf_fname, reference, path, feature_names=None):
    import time
    start = time.time()
    try:
        vcf_dict = read_in_vcf(vcf_fname, ref_fasta(path))
    except:
        print "Loading input alignment failed!: {}".format(vcf_fname)
        raise
    end = time.time()
    print "Reading in VCF took {}".format(str(end-start))

    start = time.time()
    featN = np.array(feature_names)
    selected_features = load_features(reference, feature_names)
    print "Translating {} genes...".format(len(selected_features))
    end = time.time()
    print "Reading in genes took {}".format(str(end-start))

    ref = vcf_dict['reference']
    sequences = vcf_dict['sequences']

    prots = {}
    deleted = []
    notMult3 = []

    start = time.time()
    #if genes have no mutations across sequences, they are dropped here from further analysis
    #check that gene lengths are multiples of 3. The first occurrence causes an error in
    #Biopython, but subsequent ones do not - so do the check ourselves.
    for fname,feature in selected_features.items():
        if len(str(feature.extract( SeqRecord(seq=Seq(ref)) ).seq))%3 != 0:
            notMult3.append(fname)

        prot_dict = translate_vcf_feature(sequences, ref, feature)
        if prot_dict is not None:
            prots[fname] = prot_dict
        else:
            deleted.append(fname)

    end = time.time()
    print "Translations took {}".format(str(end-start))

    start = time.time()
    #print out VCF of proteins
    #in new augur, set compress depending on input file ending!
    write_VCF_translation(prots, translation_vcf_file(path), translation_ref_file(path), compress=False)
    end = time.time()
    print "Writing out VCF took {}".format(str(end-start))

    #warn of those that don't have a length mult of 3
    print "WARNING: These genes do not have lengths that are a multiple of 3!\n{}".format(str(notMult3))

    #print dropped genes to a text file
    if len(deleted) != 0:
        with open(dropped_genes(path), 'w') as the_file:
            for d in deleted:
                the_file.write(d+"\n")
        print "{} genes had no mutations and so will be excluded. Excluded genes can be found in {}".format(len(deleted), dropped_genes(path))

    return prots
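
For orientation, a hypothetical invocation (the file paths and gene names below are placeholders, not from the original project):

# Translate two genes from a VCF against a GenBank reference (hypothetical paths).
prots = translate_vcf('data/variants.vcf.gz', 'data/reference.gb',
                      path='builds/tb/', feature_names=['gyrA', 'rpoB'])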
Example #5
def run_model(train_data,
              test_data,
              n_trees,
              submit_id,
              model,
              save_model=False):
    '''
  Trains a model of the specified type and size on the training data,
  then predicts on the test data and writes out a submission.
  
  Args:
    train_data - bare training feature set name without path or extension 
    test_data - bare test feature set name without path or extension
    n_trees - number of trees to use in model
    submit_id - the result is written as submissions/submission_<submit_id>.csv
    model - a string...either 'rf' or 'extra'
    save_model - default False. If true, use joblib to dump the model at:
      paths.MODELS/<submit_id>_model.job

  Writes:
    A submission at paths.SUBMIT/submission_<submit_id>.csv
  '''
    start = datetime.now()
    train = util.load_features(train_data)
    drops = util.get_drop_cols(train)
    train.drop(drops, axis=1, inplace=True)
    print 'training set size: (%d, %d)' % train.shape
    print 'Training...'
    if model == 'rf':
        model = train_rf(train, n_trees)
    else:
        model = train_extra_trees(train, n_trees)
    if save_model:
        model_path = os.path.join(paths.MODELS, submit_id + '_model.job')
        joblib.dump(model, model_path)
    del train
    print 'Predicting...'
    test = util.load_features(test_data)
    test.drop(drops, axis=1, inplace=True)
    print 'test set size: (%d, %d)' % test.shape
    result = predict(model, test)
    submission_name = 'submission_%s.csv' % str(submit_id)
    submission = os.path.join(paths.SUBMIT, submission_name)
    result.to_csv(submission, index=False)
    finish = datetime.now()
    print 'Run finished: %d sec.' % (finish - start).seconds
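
A hypothetical invocation (the feature-set names, tree count, and submit id below are placeholders):

# Trains a 500-tree random forest on the named feature sets, writes
# submissions/submission_42.csv, and saves the fitted model via joblib.
run_model('train_features_v1', 'test_features_v1',
          n_trees=500, submit_id=42, model='rf', save_model=True)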
Example #6
def main():
    dataset = None
    if len(sys.argv) > 1:
        dataset = sys.argv[1]
    metadata = util.get_metadata((dataset + "_metadata") if dataset else None)
    
    mfcc = dict(zip([metadata[i][0] for i in range(1, len(metadata))], util.load_features((dataset + "_features") if dataset else None)))

    # Load pyAudioAnalysis features
    with open("F", "rb") as f:
        feats, files = pickle.load(f, encoding="latin1")
    files = [f.split(".")[0].split("XC")[-1] for f in files]
    F = dict(zip(files, feats))
    full_dataset = True
    for item in metadata[1:]:
        if item[0] not in F:
            full_dataset = False
    X2, X3 = [], []
    if full_dataset:
        X3 = [np.concatenate((F[item[0]], mfcc[item[0]]), axis=0) for item in metadata[1:]]
        X2 = [F[item[0]] for item in metadata[1:]]
    X1 = [mfcc[item[0]] for item in metadata[1:]]

    for X in [X1, X2]:
        NUM_RUNS = 50
        Y = util.load_labels((dataset + "_metadata") if dataset else None)
        samples = range(len(X))#range(1, len(X), 12)#random.sample(range(len(X)), 25)
        samps = samples#range(len(X))#samples 
        x = [X[i] for i in samps]
        y = [Y[i] for i in samples]
        N_ESTIMATORS = 20
        avg_mat = None 

        for run in range(NUM_RUNS): 
            clf = RandomForestClassifier(n_estimators=N_ESTIMATORS, max_features=20, oob_score=True).fit(X, Y)
            similarity = dict()
            for dt in clf.estimators_:
                leaves = dt.apply(X)
                for i in samps:
                    for j in samps:
                        if leaves[i] == leaves[j]:
                            similarity[(i,j)] = similarity.get((i,j), 0) + 1

            mat = np.array([[(1.0 - similarity.get((i,j), 0)/N_ESTIMATORS)**2 for j in samples] for i in samples])
            mat = squareform(mat)
            if avg_mat is None:
                avg_mat = mat
            else:
                avg_mat = np.add(avg_mat, mat)  
        avg_mat = avg_mat / NUM_RUNS
        linkage_matrix = linkage(avg_mat, "single")
        matplotlib.rcParams['lines.linewidth'] = 2.5
        dendrogram(linkage_matrix, color_threshold=0.8, labels=y, show_leaf_counts=True)
        plt.xlabel("label")
        plt.ylabel("distance")
        plt.show()
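
The nested i/j loops above accumulate a random-forest proximity matrix one pair at a time. The same distance matrix can be computed in a vectorized way (a sketch, not the original author's code; `clf` and `X` are as in the loop above):

import numpy as np
from scipy.spatial.distance import squareform

def rf_proximity_distance(clf, X):
    leaves = clf.apply(X)                       # (n_samples, n_trees) leaf indices
    same_leaf = leaves[:, None, :] == leaves[None, :, :]
    proximity = same_leaf.mean(axis=2)          # fraction of trees sharing a leaf
    distance = (1.0 - proximity) ** 2           # same transform as above
    return squareform(distance, checks=False)   # condensed form expected by linkage()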
Example #7
def run():

    for center in CONFIG.keys():

        # Load meta data and give it a new index based on measurement and subject ID
        meta_data_file_path = CONFIG[center]['meta']
        meta_data = pd.read_excel(meta_data_file_path, header=3)
        index = []
        for i in range(len(meta_data.index)):
            mid = meta_data.iloc[i][meta_data.columns[0]]
            sid = meta_data.iloc[i][meta_data.columns[1]]
            index.append('{}_{}_sMRI'.format(sid, mid))
        meta_data['id'] = pd.Series(index)
        meta_data.set_index('id', drop=True, inplace=True)

        # Load feature data
        features_file_path = CONFIG[center]['features']
        features = util.load_features(features_file_path, index_col='MRid')

        try:
            # Select rows in meta data corresponding to subject IDs in feature data.
            # Currently, there seems to be something wrong with the CIMH data, that
            # is, there's no overlap in subject IDs at all...
            # TODO: Wait for Emanuel to explain
            meta_data = meta_data.loc[features.index]
        except KeyError as e:
            print(
                'Subject IDs feature data do not match meta data {}'.format(e))
            continue

        meta_data = meta_data[meta_data['Gender [m/f]'].notnull()]

        # Convert gender values to standardized format
        for idx in meta_data.index:
            gender = meta_data.loc[idx]['Gender [m/f]']
            meta_data.set_value(idx, 'Gender [m/f]', to_gender(gender))
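            # NOTE: DataFrame.set_value was deprecated in pandas 0.21 and removed in 1.0;
            # on newer pandas the equivalent is:
            #     meta_data.at[idx, 'Gender [m/f]'] = to_gender(gender)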

        # Add columns to original feature data
        features['Center'] = center
        features['Age'] = meta_data['Age [years]']
        features['Gender'] = meta_data['Gender [m/f]']
        features['Diagnosis'] = meta_data['Diagnosis']
        CONFIG[center]['features_ext'] = features

    # Concatenate feature data sets
    features = pd.concat([
        CONFIG['CIMH']['features_ext'],
        CONFIG['UIO']['features_ext'],
        CONFIG['UNIBA']['features_ext'],
        CONFIG['UNICH']['features_ext'],
    ])

    # Save concatenated feature data back to CSV file
    util.save_features(OUTPUT_FILE, features, index_label='MRid')
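
As an aside, the per-row index-building loop near the top of run() can be expressed vectorially; a sketch assuming, as above, that the first two metadata columns hold the measurement and subject IDs:

def build_measurement_index(meta_data):
    # Vectorized equivalent of the loop above: index becomes '<sid>_<mid>_sMRI'.
    mid = meta_data.iloc[:, 0].astype(str)
    sid = meta_data.iloc[:, 1].astype(str)
    meta_data = meta_data.copy()
    meta_data.index = sid + '_' + mid + '_sMRI'
    meta_data.index.name = 'id'
    return meta_data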
Example #8
def translate(aln_fname, reference, feature_names, name_func):
    try:
        aln = AlignIO.read(aln_fname, 'fasta')
    except:
        print("Loading input alignment failed!:", aln_fname)
        raise

    selected_features = load_features(reference, feature_names)

    for fname, feature in selected_features.items():
        translation = translate_feature(aln, feature)
        AlignIO.write(translation, name_func(fname), 'fasta')
Example #9
def run_model(train_data, test_data, n_trees, submit_id, model, save_model=False):
  '''
  Trains a model of the specified type and size on the training data,
  then predicts on the test data and writes out a submission.
  
  Args:
    train_data - bare training feature set name without path or extension 
    test_data - bare test feature set name without path or extension
    n_trees - number of trees to use in model
    submit_id - the result is written as submissions/submission_<submit_id>.csv
    model - a string...either 'rf' or 'extra'
    save_model - default False. If true, use joblib to dump the model at:
      paths.MODELS/<submit_id>_model.job

  Writes:
    A submission at paths.SUBMIT/submission_<submit_id>.csv
  '''
  start = datetime.now()
  train = util.load_features(train_data)
  drops = util.get_drop_cols(train)
  train.drop(drops, axis=1, inplace=True)
  print 'training set size: (%d, %d)' % train.shape
  print 'Training...'
  if model == 'rf':
    model = train_rf(train, n_trees)
  else:
    model = train_extra_trees(train, n_trees)
  if save_model:
    model_path = os.path.join(paths.MODELS, submit_id + '_model.job')
    joblib.dump(model, model_path)
  del train
  print 'Predicting...'
  test = util.load_features(test_data)
  test.drop(drops, axis=1, inplace=True)
  print 'test set size: (%d, %d)' % test.shape
  result = predict(model, test)
  submission_name = 'submission_%s.csv' % str(submit_id)
  submission = os.path.join(paths.SUBMIT, submission_name)
  result.to_csv(submission, index=False)
  finish = datetime.now()
  print 'Run finished: %d sec.' % (finish - start).seconds
Example #10
def run():

    for center in CONFIG.keys():

        # Load meta data and give it a new index based on measurement and subject ID
        meta_data_file_path = CONFIG[center]['meta']
        meta_data = pd.read_excel(meta_data_file_path, header=3)
        index = []
        for i in range(len(meta_data.index)):
            mid = meta_data.iloc[i][meta_data.columns[0]]
            sid = meta_data.iloc[i][meta_data.columns[1]]
            index.append('{}_{}_sMRI'.format(sid, mid))
        meta_data['id'] = pd.Series(index)
        meta_data.set_index('id', drop=True, inplace=True)

        # Load feature data
        features_file_path = CONFIG[center]['features']
        features = util.load_features(features_file_path, index_col='MRid')

        try:
            # Select rows in meta data corresponding to subject IDs in feature data.
            # Currently, there seems to be something wrong with the CIMH data, that
            # is, there's no overlap in subject IDs at all...
            # TODO: Wait for Emanuel to explain
            meta_data = meta_data.loc[features.index]
        except KeyError as e:
            print('Subject IDs feature data do not match meta data {}'.format(e))
            continue

        meta_data = meta_data[meta_data['Gender [m/f]'].notnull()]

        # Convert gender values to standardized format
        for idx in meta_data.index:
            gender = meta_data.loc[idx]['Gender [m/f]']
            meta_data.set_value(idx, 'Gender [m/f]', to_gender(gender))

        # Add columns to original feature data
        features['Center'] = center
        features['Age'] = meta_data['Age [years]']
        features['Gender'] = meta_data['Gender [m/f]']
        features['Diagnosis'] = meta_data['Diagnosis']
        CONFIG[center]['features_ext'] = features

    # Concatenate feature data sets
    features = pd.concat([
        CONFIG['CIMH']['features_ext'],
        CONFIG['UIO']['features_ext'],
        CONFIG['UNIBA']['features_ext'],
        CONFIG['UNICH']['features_ext'],
    ])

    # Save concatenated feature data back to CSV file
    util.save_features(OUTPUT_FILE, features, index_label='MRid')
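
to_gender is not shown in these snippets; a plausible sketch of the normalization it performs (an assumption, not the project's actual helper):

def to_gender(value):
    # Hypothetical mapping of free-form entries ('m', 'Male', 'F', ...) to 'M'/'F'.
    text = str(value).strip().lower()
    if text in ('m', 'male'):
        return 'M'
    if text in ('f', 'female', 'w'):
        return 'F'
    return None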
Example #11
def export_diversity(path, prefix, reference, indent=None):
    '''
    write the alignment entropy of each alignment (nucleotide and translations) to file
    '''
    genes = load_features(reference)
    entropy_json = {}
    for feat, aln_fname in get_genes_and_alignments(path, tree=False):
        entropy = diversity_statistics(aln_fname, nuc=feat == 'nuc')
        S = [max(0, round(x, 4)) for x in entropy]
        n = len(S)
        if feat == 'nuc':
            entropy_json[feat] = {
                'pos': range(0, n),
                'codon': [x // 3 for x in range(0, n)],
                'val': S
            }
        elif feat in genes:
            entropy_json[feat] = {
                'pos': [x for x in genes[feat]][::3],
                'codon': range(n),
                'val': S
            }
    write_json(entropy_json, diversity_json(path, prefix), indent=indent)
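
diversity_statistics is defined elsewhere; a minimal sketch of a per-column Shannon entropy over a FASTA alignment (an approximation of what it likely computes, not the project's exact implementation):

import numpy as np
from Bio import AlignIO

def alignment_entropy(aln_fname):
    # Per-column Shannon entropy of a FASTA alignment (hypothetical helper).
    aln = np.array([list(str(rec.seq)) for rec in AlignIO.read(aln_fname, 'fasta')])
    entropy = []
    for column in aln.T:
        _, counts = np.unique(column, return_counts=True)
        p = counts / float(counts.sum())
        entropy.append(float(-np.sum(p * np.log(p))))
    return entropy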
Example #12
def main():
    dataset = None
    if len(sys.argv) > 1:
        dataset = sys.argv[1]
    metadata = util.get_metadata((dataset + "_metadata") if dataset else None)
    
    mfcc = dict(zip([metadata[i][0] for i in range(1, len(metadata))], util.load_features((dataset + "_features") if dataset else None)))
    feats, files = None,None
    with open("F", "rb") as f:
        feats, files = pickle.load(f, encoding="latin1")
    files = [f.split(".")[0].split("XC")[-1] for f in files]
    F = dict(zip(files, feats))
    full_dataset = True
    for item in metadata[1:]:
        if item[0] not in F:
            full_dataset = False
    X2, X3 = [], [] 
    if full_dataset:
        X3 = [np.concatenate((F[item[0]], mfcc[item[0]]), axis=0) for item in metadata[1:]]
        X2 = [F[item[0]] for item in metadata[1:]] 
    X1 = [mfcc[item[0]] for item in metadata[1:]] 
    Y = util.load_labels((dataset + "_metadata") if dataset else None)#"bbsmd.csv")

    for X in [X1, X2] if full_dataset else [X1,]:
        print("------")
       
        classifiers = [ RandomForestClassifier(n_estimators=50, max_features=15, oob_score=True),
            KNeighborsClassifier(3),
            svm.SVC(kernel='linear', C=1),
            svm.SVC(gamma=2, C=1),
            GaussianNB()
        ]
        for clf in classifiers:
            scores = cross_val_score(clf, X, Y, cv=5)
            score = sum(scores)/len(scores)
            print(type(clf).__name__, "\t", score)
Example #13
def translate(aln_fname, reference, feature_names, name_func):
    try:
        aln = AlignIO.read(aln_fname, 'fasta')
    except:
        print("Loading input alignment failed!:", aln_fname)
        raise

    selected_features = load_features(reference, feature_names)

    for fname, feature in selected_features.items():
        translation = translate_feature(aln, feature)
        AlignIO.write(translation, name_func(fname), 'fasta')


if __name__ == '__main__':
    parser = generic_argparse("Translate the nucleotide alignments")
    parser.add_argument('--reference',
                        required=True,
                        help='genbank file containing the annotation')
    parser.add_argument('--genes', nargs='+', help="genes to translate")
    args = parser.parse_args()

    path = args.path

    if not args.genes:
        genes = load_features(args.reference).keys()
    else:
        genes = args.genes

    for func in [tree_sequence_alignment, ref_alignment]:
        aln_fname = func(path, 'nuc')
        if os.path.isfile(aln_fname):
            translate(aln_fname, args.reference, genes,
                      lambda x: func(path, x))
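
A hypothetical command line for this script (the script and file names are placeholders; generic_argparse is assumed to provide --path):

# python translate.py --path builds/tb/ --reference data/reference.gb --genes gyrA rpoB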
Example #14
def export_metadata_json(T, path, prefix, reference, isvcf=False, indent=1):
    print("Writing out metaprocess")
    mjson = {}

    mjson["virus_count"] = T.count_terminals()
    from datetime import date
    mjson["updated"] = date.today().strftime('%Y-%m-%d')
    mjson["author_info"] = {
        "?": {
            "paper_url": "?",
            "journal": "?",
            "title": "?",
            "n": 1
        }
    }
    mjson["seq_author_map"] = {}

    from collections import defaultdict
    cmaps = defaultdict(list)
    with open(color_maps(path), 'r') as cfile:
        for line in cfile:
            try:
                trait, name, color = line.strip().split('\t')
            except:
                continue
            cmaps[trait].append((name, color))

    #if drug-resistance colours have been auto-generated, get these too
    import os.path
    if os.path.isfile(drm_color_maps(path)):
        with open(drm_color_maps(path), 'r') as cfile:
            for line in cfile:
                try:
                    trait, name, color = line.strip().split('\t')
                except:
                    continue
                cmaps[trait].append((name, color))

    mjson["color_options"] = {
        "gt": {
            "menuItem": "genotype",
            "type": "discrete",
            "legendTitle": "Genotype",
            "key": "genotype"
        },
        "num_date": {
            "menuItem": "date",
            "type": "continuous",
            "legendTitle": "Sampling date",
            "key": "num_date"
        }
    }
    for trait in cmaps:
        mjson["color_options"][trait] = {
            "menuItem": trait,
            "type": "discrete",
            "color_map": cmaps[trait],
            "legendTitle": trait,
            "key": trait
        }

    mjson["panels"] = ["tree", "map", "entropy"]
    mjson["title"] = "NextTB"
    mjson["maintainer"] = "Emma Hodcroft"
    mjson["geo"] = {}
    lat_long_defs = load_lat_long_defs()
    for geo_trait in ['region', "country", 'division']:
        mjson["geo"][geo_trait] = {}
        for n in T.find_clades():
            if geo_trait in n.attr:
                place = n.attr[geo_trait]
                if (place not in mjson["geo"][geo_trait]
                        and place in lat_long_defs):
                    mjson["geo"][geo_trait][place] = lat_long_defs[place]

    mjson["commit"] = "unknown"
    mjson["filters"] = ["country", "region", "division"]

    genes = load_features(reference)
    anno = {}
    for feat, aln_fname in get_genes_and_alignments(path, tree=False):
        if feat in genes:
            anno[feat] = {
                "start": int(genes[feat].location.start),
                "end": int(genes[feat].location.end),
                "strand": genes[feat].location.strand
            }

    if isvcf:
        #if vcf, there is no 'gene' called 'nuc' that will be read in
        #above, so manually include it here.
        from filenames import ref_fasta
        from Bio import SeqIO
        refSeq = SeqIO.parse(ref_fasta(path), format='fasta').next()
        anno['nuc'] = {"start": 1, "end": len(refSeq.seq), "strand": 1}

    mjson["annotations"] = anno
    write_json(mjson, meta_json(path, prefix), indent=indent)
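
The color-map parsing above expects a tab-separated file with one trait/name/color triple per line; a hypothetical excerpt of such a file (values made up):

region	africa	#5097BA
country	germany	#4041C7
division	bavaria	#E67030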
Example #15
def generate_matrix_with_label(raw_item_file_path, uipairs_features_file_path,
                               users_features_file_path,
                               items_features_file_path,
                               categorys_features_file_path,
                               ucpairs_features_file_path, label_file_path,
                               begin_date, end_date):
    print "\n" + begin_date + "---" + end_date + "generating matrix with label..."

    users_column_index, users_features = load_features(
        users_features_file_path)
    items_column_index, items_features = load_features(
        items_features_file_path)
    categorys_column_index, categorys_features = load_features(
        categorys_features_file_path)
    uc_column_index, uc_features = load_uc_features(ucpairs_features_file_path)

    uipairs_features_file = open(uipairs_features_file_path)

    matrix_file_path = "./feature/" + begin_date + "-" + end_date + "-matrix-label.csv"
    matrix_file = open(matrix_file_path, 'w')

    # Load the item -> category dictionary
    item_category_dict = {}
    raw_item_file = open(raw_item_file_path)
    for line in raw_item_file:
        line_entrys = line.strip().split(delimiter)
        item_id = line_entrys[0]
        category_id = line_entrys[2]
        item_category_dict[item_id] = category_id
    raw_item_file.close()

    label_set = set()
    label_file = open(label_file_path)
    for line in label_file:
        line_entrys = line.split(delimiter)
        ui_id = delimiter.join(line_entrys[0:2])
        if line_entrys[2] == '4':
            label_set.add(ui_id)
    label_file.close()

    # Read the column (header) names
    users_features_file = open(users_features_file_path)
    items_features_file = open(items_features_file_path)
    categorys_features_file = open(categorys_features_file_path)
    ucpairs_features_file = open(ucpairs_features_file_path)
    ui_column_name = uipairs_features_file.readline().split(delimiter)[:-1] + \
                     users_features_file.readline().split(delimiter)[1:-1] + \
                     items_features_file.readline().split(delimiter)[1:-1] + \
                     categorys_features_file.readline().split(delimiter)[1:-1] + \
                     ucpairs_features_file.readline().split(delimiter)[2:-1]
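    # Note: the readline() call above also consumes the header row of
    # uipairs_features_file, so the loop below starts at the first data row.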

    # matrix_file.write(delimiter.join(ui_column_name) + ",label\n")
    users_features_file.close()
    items_features_file.close()
    categorys_features_file.close()
    ucpairs_features_file.close()

    for line in uipairs_features_file:
        line_entrys = line.split(delimiter)
        user_id = line_entrys[0]
        item_id = line_entrys[1]

        # matrix_line = delimiter.join(line_entrys[:-1]) + delimiter + \
        #               delimiter.join(users_features[user_id]) + delimiter + \
        #               delimiter.join(items_features[item_id]) + \
        #               delimiter.join(categorys_features[item_id]) + \
        #               delimiter.join(uc_features[ui_id]) + "\n"
        matrix_line = line_entrys[:-1] + \
                      users_features[user_id] + \
                      items_features[item_id] + \
                      categorys_features[item_category_dict[item_id]] + \
                      uc_features[item_category_dict[item_id]]

        label = "0"
        if (user_id + "," + item_id) in label_set:
            label = "1"
        matrix_file.write(delimiter.join(matrix_line) + "," + label + "\n")

    matrix_file.close()
    uipairs_features_file.close()
    print "generate matrix with label completed\n"

    return matrix_file_path


# path = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))+'\\source'
# os.chdir(path)  # change dir to '~/files'
#
# uipairs_features_file_path = "./feature/2014-12-9-2014-12-18-uifeat.csv"
# users_features_file_path = "./feature/2014-12-9-2014-12-18-userfeat.csv"
# items_features_file_path = "./feature/2014-12-9-2014-12-18-itemfeat.csv"
#
# generate_matrix(uipairs_features_file_path, users_features_file_path, items_features_file_path, "2014-12-9", "2014-12-18")
Example #16
def generate_matrix(raw_item_file_path, uipairs_features_file_path,
                    users_features_file_path, items_features_file_path,
                    categorys_features_file_path, ucpairs_features_file_path,
                    begin_date, end_date):
    print "\n" + begin_date + "---" + begin_date + "generating matrix..."

    users_column_index, users_features = load_features(
        users_features_file_path)
    items_column_index, items_features = load_features(
        items_features_file_path)
    categorys_column_index, categorys_features = load_features(
        categorys_features_file_path)
    uc_column_index, uc_features = load_uc_features(ucpairs_features_file_path)

    uipairs_features_file = open(uipairs_features_file_path)

    matrix_file_path = "./feature/" + begin_date + "-" + end_date + "-matrix.csv"
    matrix_file = open(matrix_file_path, 'w')

    # Load the item -> category dictionary
    item_category_dict = {}
    raw_item_file = open(raw_item_file_path)
    for line in raw_item_file:
        line_entrys = line.strip().split(delimiter)
        item_id = line_entrys[0]
        category_id = line_entrys[2]
        item_category_dict[item_id] = category_id
    raw_item_file.close()

    # Read the column (header) names
    users_features_file = open(users_features_file_path)
    items_features_file = open(items_features_file_path)
    categorys_features_file = open(categorys_features_file_path)
    ucpairs_features_file = open(ucpairs_features_file_path)
    ui_column_name = uipairs_features_file.readline().split(delimiter)[:-1] + \
                     users_features_file.readline().split(delimiter)[1:-1] + \
                     items_features_file.readline().split(delimiter)[1:-1] + \
                     categorys_features_file.readline().split(delimiter)[1:-1] + \
                     ucpairs_features_file.readline().split(delimiter)[2:-1]

    matrix_file.write(delimiter.join(ui_column_name) + "\n")
    users_features_file.close()
    items_features_file.close()
    categorys_features_file.close()
    ucpairs_features_file.close()

    for line in uipairs_features_file:
        line_entrys = line.split(delimiter)
        user_id = line_entrys[0]
        item_id = line_entrys[1]

        # matrix_line = delimiter.join(line_entrys[:-1]) + delimiter + \
        #               delimiter.join(users_features[user_id]) + delimiter + \
        #               delimiter.join(items_features[item_id]) + \
        #               delimiter.join(categorys_features[item_id]) + \
        #               delimiter.join(uc_features[ui_id]) + "\n"
        matrix_line = line_entrys[:-1] + \
                      users_features[user_id] + \
                      items_features[item_id] + \
                      categorys_features[item_category_dict[item_id]] + \
                      uc_features[item_category_dict[item_id]]

        matrix_file.write(delimiter.join(matrix_line) + "\n")

    matrix_file.close()
    uipairs_features_file.close()
    print "generate matrix completed\n"

    return matrix_file_path
Example #17
def load():

    return util.load_features(FILE_NAME, index_col='MRid')
Example #18
def generate_matrix_with_label(raw_item_file_path, uipairs_features_file_path, users_features_file_path, items_features_file_path, categorys_features_file_path, ucpairs_features_file_path, label_file_path, begin_date, end_date):
    print "\n" + begin_date + "---" + end_date + "generating matrix with label..."

    users_column_index, users_features = load_features(users_features_file_path)
    items_column_index, items_features = load_features(items_features_file_path)
    categorys_column_index, categorys_features = load_features(categorys_features_file_path)
    uc_column_index, uc_features = load_uc_features(ucpairs_features_file_path)

    uipairs_features_file = open(uipairs_features_file_path)

    matrix_file_path = "./feature/" + begin_date + "-" + end_date + "-matrix-label.csv"
    matrix_file = open(matrix_file_path, 'w')

    # Load the item -> category dictionary
    item_category_dict = {}
    raw_item_file = open(raw_item_file_path)
    for line in raw_item_file:
        line_entrys = line.strip().split(delimiter)
        item_id = line_entrys[0]
        category_id = line_entrys[2]
        item_category_dict[item_id] = category_id
    raw_item_file.close()

    label_set = set()
    label_file = open(label_file_path)
    for line in label_file:
        line_entrys = line.split(delimiter)
        ui_id = delimiter.join(line_entrys[0:2])
        if line_entrys[2] == '4':
            label_set.add(ui_id)
    label_file.close()

    # Read the column (header) names
    users_features_file = open(users_features_file_path)
    items_features_file = open(items_features_file_path)
    categorys_features_file = open(categorys_features_file_path)
    ucpairs_features_file = open(ucpairs_features_file_path)
    ui_column_name = uipairs_features_file.readline().split(delimiter)[:-1] + \
                     users_features_file.readline().split(delimiter)[1:-1] + \
                     items_features_file.readline().split(delimiter)[1:-1] + \
                     categorys_features_file.readline().split(delimiter)[1:-1] + \
                     ucpairs_features_file.readline().split(delimiter)[2:-1]

    # matrix_file.write(delimiter.join(ui_column_name) + ",label\n")
    users_features_file.close()
    items_features_file.close()
    categorys_features_file.close()
    ucpairs_features_file.close()

    for line in uipairs_features_file:
        line_entrys = line.split(delimiter)
        user_id = line_entrys[0]
        item_id = line_entrys[1]

        # matrix_line = delimiter.join(line_entrys[:-1]) + delimiter + \
        #               delimiter.join(users_features[user_id]) + delimiter + \
        #               delimiter.join(items_features[item_id]) + \
        #               delimiter.join(categorys_features[item_id]) + \
        #               delimiter.join(uc_features[ui_id]) + "\n"
        matrix_line = line_entrys[:-1] + \
                      users_features[user_id] + \
                      items_features[item_id] + \
                      categorys_features[item_category_dict[item_id]] + \
                      uc_features[item_category_dict[item_id]]

        label = "0"
        if (user_id+","+item_id) in label_set:
            label = "1"
        matrix_file.write(delimiter.join(matrix_line) + "," + label + "\n")

    matrix_file.close()
    uipairs_features_file.close()
    print "generate matrix with label completed\n"

    return matrix_file_path

# path = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))+'\\source'
# os.chdir(path)  # change dir to '~/files'
#
# uipairs_features_file_path = "./feature/2014-12-9-2014-12-18-uifeat.csv"
# users_features_file_path = "./feature/2014-12-9-2014-12-18-userfeat.csv"
# items_features_file_path = "./feature/2014-12-9-2014-12-18-itemfeat.csv"
#
# generate_matrix(uipairs_features_file_path, users_features_file_path, items_features_file_path, "2014-12-9", "2014-12-18")
Example #19
File: similarity.py  Project: Shwam/Birds
def main():
    dataset = None
    if len(sys.argv) > 1:
        dataset = sys.argv[1]
    metadata = util.get_metadata((dataset + "_metadata") if dataset else None)

    mfcc = dict(
        zip([metadata[i][0] for i in range(1, len(metadata))],
            util.load_features((dataset + "_features") if dataset else None)))

    # Load pyAudioAnalysis features
    with open("F", "rb") as f:
        feats, files = pickle.load(f, encoding="latin1")
    files = [f.split(".")[0].split("XC")[-1] for f in files]
    F = dict(zip(files, feats))
    full_dataset = True
    for item in metadata[1:]:
        if item[0] not in F:
            full_dataset = False
    X2, X3 = [], []
    if full_dataset:
        X3 = [
            np.concatenate((F[item[0]], mfcc[item[0]]), axis=0)
            for item in metadata[1:]
        ]
        X2 = [F[item[0]] for item in metadata[1:]]
    X1 = [mfcc[item[0]] for item in metadata[1:]]

    #X = util.load_features((dataset + "_features") if dataset else None)
    for X in [X1, X2]:
        labels = []
        avg_mat = None
        all_sims = dict()
        Y = util.load_labels((dataset + "_metadata") if dataset else None)
        samples = range(
            len(X))  #range(1, len(X), 12)#random.sample(range(len(X)), 25)
        samps = range(len(X))  #samples
        x = [X[i] for i in samps]
        y = [Y[i] for i in samples]

        N_ESTIMATORS = 80
        NUM_RUNS = 5

        for run in range(NUM_RUNS):
            clf = RandomForestClassifier(n_estimators=N_ESTIMATORS,
                                         max_features=25,
                                         oob_score=True).fit(X, Y)
            similarity = dict()
            for dt in clf.estimators_:
                leaves = dt.apply(X)
                for i in samps:
                    for j in samps:
                        if leaves[i] == leaves[j]:
                            similarity[(i, j)] = similarity.get(
                                (i, j), 0) + (1 / N_ESTIMATORS)

            species_similarity = dict()
            for i in samps:
                for j in samps:
                    species_similarity[(Y[i], Y[j])] = species_similarity.get(
                        (Y[i], Y[j]), 0) + similarity.get(
                            (i, j), 0)**2 / (Y.count(Y[i]) * Y.count(Y[j]))

            for k in species_similarity:
                species_similarity[k] = species_similarity[k]**(0.5)

            labels = clf.classes_
            for i in range(len(labels)):
                normal = species_similarity[(labels[i], labels[i])]
                for j in range(i, len(labels)):
                    k = labels[i], labels[j]
                    species_similarity[k] /= normal
                    species_similarity[(k[1], k[0])] = species_similarity[k]
                    all_sims[k] = all_sims.get(
                        k, 0) + species_similarity[k] / NUM_RUNS

            mat = np.array([[(1.0 - species_similarity.get((i, j), 0))**2
                             for j in labels] for i in labels])
            print(mat)
            mat = squareform(mat)
            if avg_mat is None:
                avg_mat = mat
            else:
                avg_mat = np.add(avg_mat, mat)
        avg_mat = avg_mat / NUM_RUNS
        print(avg_mat)
        for k in all_sims:
            if k[0] != k[1] and all_sims[k] > 0.1:
                print("{}\t{}\t{}".format(k[0], k[1], all_sims[k]))
        linkage_matrix = linkage(avg_mat, "single")
        matplotlib.rcParams['lines.linewidth'] = 2.5
        dendrogram(linkage_matrix,
                   color_threshold=0.65,
                   labels=labels,
                   show_leaf_counts=True)
        plt.xlabel("label")
        plt.ylabel("distance")
        plt.show()
Example #20
#     #train_labels,_ = util.read_h5(os.path.join(prefix,output_dir,"train_selected_cnn_{}_label.h5".format(feature_name)))
#     train_features,train_labels = util.load_features(prefix,output_dir,feature_name,fold,"train")
#     test_features,test_labels = util.load_features(prefix,test_output_dir,feature_name,fold,"test")
#     #compute_features(train_features,train_labels,"fisher_{}.npy".format(fold),method="fisher")
#     load_feature_sets("fisher",feature_num=1000,padded=True,save=False)

# num of feature vs acc/f1
#for i in [10,20,50,100,1000]:
for i in [1000]:
    accs, mf1s, wf1s = [], [], []
    for j in [1, 2, 3]:
        fold = j
        feature_name = "embo{}_norm".format(fold)
        #model_name="train_combined{}_multiple_norm".format(fold)

        train_features, train_labels = util.load_features(
            prefix, output_dir, feature_name, fold, "train")
        test_features, test_labels = util.load_features(
            prefix, output_dir, feature_name, fold, "test")
        acc, mf1, wf1, conf = load_feature_sets("fisher",
                                                i,
                                                save=False,
                                                padded=False)
        if j == 1: confs = conf
        else: confs += conf
        accs.append(acc)
        mf1s.append(mf1)
        wf1s.append(wf1)
    print("mean acc", np.mean(accs), "std acc", np.std(accs))
    print("mean weighted F1 scores", np.mean(wf1s), "std weighted F1 scores",
          np.std(wf1s))
    print("mean macro F1 scores", np.mean(mf1s), "std macro F1 scores",
예제 #21
0
def PCA_analysis(feature_num=5,
                 fold=1,
                 method="mrmr",
                 cluster_method="one-vs-one"):
    all_idx = []
    feature_name = "combined{}_multiple_norm".format(fold)
    if cluster_method == "one-vs-one":
        for count, combo in enumerate(combinations(labels_dict.items(), 2)):
            score, idx = load_features("{}_{}_{}_{}.npy".format(
                method, combo[0][0], combo[1][0], fold))
            all_idx.extend(idx[:feature_num])
            #all_idx.extend(get_most_important_features(os.path.join(method,"{}_feature_importance_{}_{}.txt".format(method,combo[0][0],combo[1][0])),idx=True)[:feature_num])
    elif cluster_method == "one-vs-all":
        for key in labels_dict.keys():
            score, idx = load_features("{}_{}_{}.npy".format(
                method, key, fold))
            all_idx.extend(idx[:feature_num])
            #all_idx.extend(get_most_important_features(os.path.join(method,"{}_feature_importance_{}.txt".format(method,key)),idx=True)[:feature_num])
    elif cluster_method == "overlap":
        for key in labels_dict.keys():
            feature_dict = find_over_lap_features(method, key, True)
            for k, v in feature_dict.items():
                all_idx.extend(v[:feature_num])

    all_idx = list(set(all_idx))

    score, all_idx_overlap = load_features("overlap_at_least_3.npy")
    intersection_idx = list(set(all_idx_overlap).intersection(set(all_idx)))
    for i in intersection_idx:
        print(feature_names[i])

    train_features, train_labels = util.load_features(prefix, output_dir,
                                                      feature_name, fold,
                                                      "train")
    train_features_selected = train_features[:, all_idx]
    test_features, test_labels = util.load_features(prefix, output_dir,
                                                    feature_name, fold, "test")
    test_features_selected = test_features[:, all_idx]
    print("original", test_features_selected.shape)
    lda = LDA(n_components=None,
              priors=None,
              shrinkage=None,
              solver='svd',
              store_covariance=False,
              tol=0.0001)
    test_features_selected = lda.fit_transform(test_features_selected,
                                               test_labels)
    print("lda", test_features_selected.shape)
    #test_features_selected=PCA(n_components=2).fit_transform(test_features_selected)
    tf = TSNE(n_components=2,
              perplexity=30).fit_transform(test_features_selected)
    colors = ['r', 'darkgreen', 'y', 'c', 'b']
    curr_colors = np.asarray([colors[int(i)] for i in test_labels])

    selected_idx = get_num_samples(test_labels)
    selected_colors = curr_colors[selected_idx]
    plt.scatter(tf[selected_idx, 0],
                tf[selected_idx, 1],
                c=selected_colors,
                alpha=0.7)

    #plt.scatter(tf[:,0],tf[:,1],c=curr_colors,alpha=0.7)

    legend_elements = [
        Line2D([0], [0],
               marker='o',
               color='w',
               label='CRY',
               markerfacecolor='r',
               markersize=10),
        Line2D([0], [0],
               marker='o',
               color='w',
               label='FUS',
               markerfacecolor='darkgreen',
               markersize=10),
        Line2D([0], [0],
               marker='o',
               color='w',
               label='LAU',
               markerfacecolor='y',
               markersize=10),
        Line2D([0], [0],
               marker='o',
               color='w',
               label='BAB',
               markerfacecolor='c',
               markersize=10),
        Line2D([0], [0],
               marker='o',
               color='w',
               label='SCR',
               markerfacecolor='b',
               markersize=10)
    ]
    plt.legend(handles=legend_elements, loc="upper right")
    #plt.show()
    #plt.savefig("overlap_at_least_3_reduced_bab_600_other_120.png")
    plt.savefig("{}/{}_top_{}_reduced_bab_600_other_120.png".format(
        method, cluster_method, feature_num))
    plt.close()
Example #22
def generate_matrix(raw_item_file_path, uipairs_features_file_path, users_features_file_path, items_features_file_path, categorys_features_file_path, ucpairs_features_file_path, begin_date, end_date):
    print "\n" + begin_date + "---" + begin_date + "generating matrix..."

    users_column_index, users_features = load_features(users_features_file_path)
    items_column_index, items_features = load_features(items_features_file_path)
    categorys_column_index, categorys_features = load_features(categorys_features_file_path)
    uc_column_index, uc_features = load_uc_features(ucpairs_features_file_path)

    uipairs_features_file = open(uipairs_features_file_path)

    matrix_file_path = "./feature/" + begin_date + "-" + end_date + "-matrix.csv"
    matrix_file = open(matrix_file_path, 'w')

    # Load the item -> category dictionary
    item_category_dict = {}
    raw_item_file = open(raw_item_file_path)
    for line in raw_item_file:
        line_entrys = line.strip().split(delimiter)
        item_id = line_entrys[0]
        category_id = line_entrys[2]
        item_category_dict[item_id] = category_id
    raw_item_file.close()

    # Read the column (header) names
    users_features_file = open(users_features_file_path)
    items_features_file = open(items_features_file_path)
    categorys_features_file = open(categorys_features_file_path)
    ucpairs_features_file = open(ucpairs_features_file_path)
    ui_column_name = uipairs_features_file.readline().split(delimiter)[:-1] + \
                     users_features_file.readline().split(delimiter)[1:-1] + \
                     items_features_file.readline().split(delimiter)[1:-1] + \
                     categorys_features_file.readline().split(delimiter)[1:-1] + \
                     ucpairs_features_file.readline().split(delimiter)[2:-1]

    matrix_file.write(delimiter.join(ui_column_name) + "\n")
    users_features_file.close()
    items_features_file.close()
    categorys_features_file.close()
    ucpairs_features_file.close()

    for line in uipairs_features_file:
        line_entrys = line.split(delimiter)
        user_id = line_entrys[0]
        item_id = line_entrys[1]

        # matrix_line = delimiter.join(line_entrys[:-1]) + delimiter + \
        #               delimiter.join(users_features[user_id]) + delimiter + \
        #               delimiter.join(items_features[item_id]) + \
        #               delimiter.join(categorys_features[item_id]) + \
        #               delimiter.join(uc_features[ui_id]) + "\n"
        matrix_line = line_entrys[:-1] + \
                      users_features[user_id] + \
                      items_features[item_id] + \
                      categorys_features[item_category_dict[item_id]] + \
                      uc_features[item_category_dict[item_id]]

        matrix_file.write(delimiter.join(matrix_line) + "\n")

    matrix_file.close()
    uipairs_features_file.close()
    print "generate matrix completed\n"

    return matrix_file_path