def calculate_reverse_inference_distance(query_image, in_images, out_images,
                                         standard_mask, equal_priors=True):
    '''calculate_reverse_inference_distance
    Return a reverse inference score for a query image. The likelihood of the
    query under each group is the squared result of
    calculate_pairwise_correlation between the query and that group's mean
    image.

    ..note::

        Reverse Inference Calculation
        ------------------------------------------------------------------
        P(mental process|activation) =
            P(activation|mental process) * P(mental process)
            divided by
            P(activation|mental process) * P(mental process)
            + P(activation|~mental process) * P(~mental process)

    :param query_image: nifti image path
        image that we want to calculate a reverse inference score for
    :param in_images: list of nifti files
        brain maps that are defined for the concept
    :param out_images: list of nifti files
        the rest
    :param standard_mask: mask handed to get_images_df as ``mask``
    :param equal_priors: boolean
        use 0.5 as a prior for each group [default True]. If set to False,
        the frequency of each group in the total set is used instead.
        "True" is recommended for small sets.
    :raises ValueError: if in_images and out_images share any image
    :return: weighted "in" likelihood divided by the sum of the weighted
        "in" and "out" likelihoods
    '''
    overlap = numpy.intersect1d(in_images, out_images)
    if len(overlap) > 0:
        raise ValueError("ERROR: in_images and out_images should not share images!")

    # One data frame holding every image, labelled by its path
    combined = in_images + out_images
    voxels = get_images_df(file_paths=combined, mask=standard_mask)
    voxels.index = combined
    group_in = voxels.loc[in_images]
    group_out = voxels.loc[out_images]

    # Priors: either flat, or the relative size of each group
    if equal_priors:
        prior_in = prior_out = 0.5
    else:
        n_in = len(in_images)
        n_out = len(out_images)
        n_total = n_in + n_out  # total number of nifti images
        prior_in = float(n_in) / n_total
        prior_out = float(n_out) / n_total

    # Read in the query image
    query = get_images_df(file_paths=query_image, mask=standard_mask)

    # Mean image for each group (column 0 of the wrapping data frame)
    mean_in = pandas.DataFrame(group_in.mean())
    mean_out = pandas.DataFrame(group_out.mean())

    # Likelihoods: squared similarity between the query and each group mean
    # NOTE(review): query[0] selects the column labelled 0 of the query frame;
    # confirm this is the vector calculate_pairwise_correlation expects.
    likelihood_in = numpy.power(
        calculate_pairwise_correlation(mean_in[0], query[0]), 2)
    likelihood_out = numpy.power(
        calculate_pairwise_correlation(mean_out[0], query[0]), 2)

    # Bayes rule
    weighted_in = likelihood_in * prior_in
    weighted_out = likelihood_out * prior_out
    return weighted_in / (weighted_in + weighted_out)
empty_nii = numpy.zeros(dataset.masker.volume.shape) empty_nii[dataset.masker.volume.get_data() != 0] = pmid_mr empty_nii = nibabel.Nifti1Image(empty_nii, affine=dataset.masker.volume.get_affine()) tmpnii = "%s/tmp.nii.gz" % (neurosynth_feature_maps) nibabel.save(empty_nii, tmpnii) # ***Interpolation must be nearest as neurosynth data is binary! nii = resample_img(tmpnii, target_affine=brain_4mm.get_affine(), interpolation="nearest") nibabel.save(nii, "%s/%s.nii.gz" % (neurosynth_feature_maps, pmid)) # Load into image data frame os.remove("%s/tmp.nii.gz" % (neurosynth_feature_maps)) concept_maps_4mm = glob("%s/*.nii.gz" % (neurosynth_feature_maps)) X = get_images_df(file_paths=concept_maps_4mm, mask=brain_4mm) Xindex = [ int( x.replace(".nii.gz", "").replace(neurosynth_feature_maps, "").replace("/", "")) for x in concept_maps_4mm ] X.index = Xindex ### ENCODING MODEL ## This is our "features" data frame # X=load_neurosynth_term_mappings() # size nterms X npapers - for each paper, a binary encoding of the presence/absence of each cog atlas term in the abstract # mapping=numpy.zeros(nvoxels,nterms) # neurosynth_map=load_data() # data from all voxels, size novels X npapers, binary encoding of activation presence/absence
def calculate_reverse_inference_distance(query_image, in_images, out_images,
                                         standard_mask, equal_priors=True):
    '''calculate_reverse_inference_distance
    Return a reverse inference score for a query image. The likelihood of the
    query under each group is the squared result of
    calculate_pairwise_correlation between the query and that group's mean
    image.

    ..note::

        Reverse Inference Calculation
        ------------------------------------------------------------------
        P(mental process|activation) =
            P(activation|mental process) * P(mental process)
            divided by
            P(activation|mental process) * P(mental process)
            + P(activation|~mental process) * P(~mental process)

    :param query_image: nifti image path
        image that we want to calculate a reverse inference score for
    :param in_images: list of nifti files
        brain maps that are defined for the concept
    :param out_images: list of nifti files
        the rest
    :param standard_mask: mask handed to get_images_df as ``mask``
    :param equal_priors: boolean
        use 0.5 as a prior for each group [default True]. If set to False,
        the frequency of each group in the total set is used instead.
        "True" is recommended for small sets.
    :raises ValueError: if in_images and out_images share any image
    :return: weighted "in" likelihood divided by the sum of the weighted
        "in" and "out" likelihoods
    '''
    overlap = numpy.intersect1d(in_images, out_images)
    if len(overlap) > 0:
        raise ValueError("ERROR: in_images and out_images should not share images!")

    # One data frame holding every image, labelled by its path
    combined = in_images + out_images
    voxels = get_images_df(file_paths=combined, mask=standard_mask)
    voxels.index = combined
    group_in = voxels.loc[in_images]
    group_out = voxels.loc[out_images]

    # Priors: either flat, or the relative size of each group
    if equal_priors:
        prior_in = prior_out = 0.5
    else:
        n_in = len(in_images)
        n_out = len(out_images)
        n_total = n_in + n_out  # total number of nifti images
        prior_in = float(n_in) / n_total
        prior_out = float(n_out) / n_total

    # Read in the query image
    query = get_images_df(file_paths=query_image, mask=standard_mask)

    # Mean image for each group (column 0 of the wrapping data frame)
    mean_in = pandas.DataFrame(group_in.mean())
    mean_out = pandas.DataFrame(group_out.mean())

    # Likelihoods: squared similarity between the query and each group mean
    # NOTE(review): query[0] selects the column labelled 0 of the query frame;
    # confirm this is the vector calculate_pairwise_correlation expects.
    likelihood_in = numpy.power(
        calculate_pairwise_correlation(mean_in[0], query[0]), 2)
    likelihood_out = numpy.power(
        calculate_pairwise_correlation(mean_out[0], query[0]), 2)

    # Bayes rule
    weighted_in = likelihood_in * prior_in
    weighted_out = likelihood_out * prior_out
    return weighted_in / (weighted_in + weighted_out)
def get_likelihood_df(nid, in_images, out_images, standard_mask, range_table,
                      threshold=2.96, output_folder=None, method=["binary"]):
    '''get_likelihood_df
    Calculate likelihoods for the "in" and "out" image groups and optionally
    save them as pickles. The user must specify the method [default is
    binary].

    Method details:
      ranges - likelihood in all thresholds defined in range_table
               (calculate_likelihood_in_ranges)
      binary - likelihood above / below a certain level
               [threshold, default=2.96] (calculate_likelihood_binary)

    Note: you do not need to calculate likelihoods in advance for the mean
    metric (using a derivation of the distance from a mean image as a
    probability score). In this case, use
    calculate_reverse_inference_distance.

    :param nid: str
        a unique identifier, typically a node ID from a
        pybraincompare.ontology.tree
    :param in_images: list
        a list of files for the "in" group relevant to some concept
    :param out_images: list
        the rest
    :param standard_mask: nibabel.Nifti1Image object
        the standard mask images are in space of
    :param range_table: pandas data frame
        a data frame of ranges with "start" and "stop"; can be generated with
        pybraincompare.inference.make_range_table
    :param threshold: float
        level used by the "binary" method [default 2.96]
    :param output_folder: path
        folder to save likelihood pickles [default is None]. If not
        specified, the df objects are returned. If specified, each entry is
        replaced by the return value of save_likelihood_pickle.
    :param method: list
        any of "ranges" and "binary"; the default list is only read, never
        modified
    :raises ValueError: if in_images and out_images share any image
    :return df: dict
        keys "in_ranges"/"out_ranges" (ranges method) and "in_bin"/"out_bin"
        (binary method). When output_folder is given, "in_out" is kept as a
        deprecated alias of "out_bin" for callers of the old, misspelled key.
    '''
    if len(numpy.intersect1d(in_images, out_images)) > 0:
        raise ValueError("ERROR: in_images and out_images should not share images!")

    # Read all images into one data frame, labelled by path
    all_images = in_images + out_images
    mr = get_images_df(file_paths=all_images, mask=standard_mask)
    mr.index = all_images
    in_subset = mr.loc[in_images]
    out_subset = mr.loc[out_images]

    # Calculate likelihood for user defined methods
    df = dict()
    if "ranges" in method:
        # Each key is computed from its own group (these were previously
        # swapped: the "in" images were stored under "out_ranges").
        df["in_ranges"] = calculate_likelihood_in_ranges(in_subset, range_table)
        df["out_ranges"] = calculate_likelihood_in_ranges(out_subset, range_table)
        if output_folder:
            df["in_ranges"] = save_likelihood_pickle(
                df["in_ranges"], output_folder, nid, "in_ranges")
            df["out_ranges"] = save_likelihood_pickle(
                df["out_ranges"], output_folder, nid, "out_ranges")

    if "binary" in method:
        df["in_bin"] = calculate_likelihood_binary(in_subset, threshold)
        df["out_bin"] = calculate_likelihood_binary(out_subset, threshold)
        if output_folder:
            df["in_bin"] = save_likelihood_pickle(
                df["in_bin"], output_folder, nid, "in_bin_%s" % threshold)
            # Previously stored under the typo key "in_out", leaving
            # "out_bin" pointing at the unsaved data frame.
            df["out_bin"] = save_likelihood_pickle(
                df["out_bin"], output_folder, nid, "out_bin_%s" % threshold)
            df["in_out"] = df["out_bin"]  # deprecated alias, see docstring
    return df
def likelihood_groups_from_tree(tree, standard_mask, input_folder,
                                image_pattern="[0]+%s[.]", output_folder=None,
                                node_pattern="[0-9]+"):
    '''likelihood_groups_from_tree
    Generate likelihood groups from a pybraincompare.ontology.tree object.
    These groups can then be used to calculate likelihoods (eg,
    p(activation|cognitive process)). The groups are optionally written as
    pickle objects, because it is ideal to calculate likelihoods on a cluster.

    :param tree: dict
        a dictionary of nodes, with base nodes matching a particular pattern
        assumed to be image (.nii.gz) files.
    :param standard_mask: nifti image (nibabel)
        standard image mask that images are registered to
    :param input_folder: path
        the folder of images to be matched to the nodes of the tree.
    :param image_pattern: str
        a regular expression template (one %s slot, filled with the node
        name) to find image files in input_folder. Default will match any
        number of leading zeros, the node name, and any extension.
    :param output_folder: path
        a folder path to save likelihood groups [default None: nothing is
        written]
    :param node_pattern: str
        a regular expression to find image nodes in the tree, matched to
        name. Default is a number of any length [neurovault image primary
        keys].
    :raises ValueError: if more than one file matches an image node
    :return groups: list
        one dict per concept node that has at least one image both inside and
        outside of it. When output_folder is given each dict is also pickled
        to <output_folder>/pbc_group_<nid>.pkl

        ..note::

             group["nid"] = "trm_12345"
             group["name"] = name of the concept node
             group["in"] = ["path1","path2",..."pathN"]
             group["out"] = ["path3","path4",..."pathM"]
             group["meta"]: meta data for the node
             group["range_table"]: the result of make_range_table over the
                                   entire set of matched images
    '''
    # Find all nodes in the tree, match to images in folder
    nodes = get_node_fields(tree, field="name", nodes=[])
    contender_files = glob("%s/*" % input_folder)

    # Image nodes are the ones whose name matches the specified pattern
    find_nodes = re.compile(node_pattern)
    image_nodes = numpy.unique(
        [node for node in nodes if find_nodes.match(node)]).tolist()

    # Node names must now be matched to files
    file_lookup = dict()
    file_names = [os.path.split(path)[-1] for path in contender_files]
    for node in image_nodes:
        find_file = re.compile(image_pattern % node)
        idx = [position for position, file_name in enumerate(file_names)
               if find_file.match(file_name)]
        if len(idx) > 1:
            # Both values must be passed as one tuple to the % operator
            raise ValueError("ERROR: found %s images that match pattern %s."
                             % (len(idx), find_file.pattern))
        elif len(idx) == 0:
            print("Did not find file for %s, will not be included in analysis." % (node))
        else:
            file_lookup[node] = contender_files[idx[0]]

    # Use pandas dataframe to not risk weird dictionary iteration orders
    files = pandas.DataFrame(list(file_lookup.values()), columns=["path"])
    files.index = list(file_lookup.keys())

    # The remaining nodes in the tree (that are not images) will have a RI score
    image_node_names = set(image_nodes)
    concept_nodes = [x for x in nodes if x not in image_node_names]

    # Create table of voxels for all images (the top node)
    mr = get_images_df(file_paths=files.path, mask=standard_mask)
    mr.index = files.index
    range_table = make_range_table(mr)

    # GROUPS ----------------------------------------------------
    # Find groups for image sets at each node (**node names must be unique)
    # This is images at (and in lower levels) of node vs. everything else
    groups = []
    for concept_node in concept_nodes:
        node = get_node_by_name(tree, concept_node)
        # Check the lookup result before reading any of its fields
        if not node:
            continue
        node_id = node["nid"]  # used in the pickle file name
        node_meta = node["meta"]
        all_children = get_node_fields(node, "name", [])
        children_in = [child for child in all_children if child in files.index]
        inside = set(children_in)
        children_out = [child for child in files.index if child not in inside]
        if len(children_in) > 0 and len(children_out) > 0:
            print("Generating group for concept node %s" % (concept_node))
            group = {"in": files.path.loc[children_in].unique().tolist(),
                     "out": files.path.loc[children_out].unique().tolist(),
                     "range_table": range_table,
                     "meta": node_meta,
                     "nid": node_id,
                     "name": concept_node}
            groups.append(group)
            if output_folder is not None:
                group_pickle = "%s/pbc_group_%s.pkl" % (output_folder, node_id)
                with open(group_pickle, "wb") as filey:
                    pickle.dump(group, filey)
    return groups
# Images by Concept data frame, our X
X = pandas.read_csv(labels_tsv, sep="\t", index_col=0)

# Get standard mask, 4mm
standard_mask = get_standard_mask(4)

# Dictionary to look up image files (4mm)
# NOTE(review): unpickling can execute arbitrary code; only load lookup files
# from a trusted source.
with open(image_lookup, "rb") as lookup_file:
    lookup = pickle.load(lookup_file)

concepts = X.columns.tolist()

# Data frame of image data for every image in the lookup, indexed by the
# integer parsed from each file name
image_paths = list(lookup.values())
mr = get_images_df(file_paths=image_paths, mask=standard_mask)
image_ids = [int(os.path.basename(x).split(".")[0]) for x in image_paths]
mr.index = image_ids

# Data frame of image data for the images of the current group.
# NOTE(review): this rebinds mr, image_paths and image_ids, so the load from
# `lookup` just above looks redundant - confirm before removing it.
image_paths = group["in"] + group["out"]
mr = get_images_df(file_paths=image_paths, mask=standard_mask)
image_ids_in = [int(os.path.basename(x).split(".")[0]) for x in group["in"]]
image_ids_out = [int(os.path.basename(x).split(".")[0]) for x in group["out"]]
image_ids = image_ids_in + image_ids_out
mr.index = image_ids

# We will save a data frame of pearson scores (to calculate accuracies later)
comparison_dfs = pandas.DataFrame()
# We will save a vector of # Images by Concept data frame, our X X = pandas.read_csv(labels_tsv, sep="\t", index_col=0) # Get standard mask, 4mm standard_mask = get_standard_mask(4) # Dictionary to look up image files (4mm) lookup = pickle.load(open(image_lookup, "rb")) concepts = X.columns.tolist() # We will go through each voxel (column) in a data frame of image data image_paths = lookup.values() mr = get_images_df(file_paths=image_paths, mask=standard_mask) image_ids = [int(os.path.basename(x).split(".")[0]) for x in image_paths] mr.index = image_ids # We will go through each voxel (column) in a data frame of image data mr = get_images_df(file_paths=group["in"] + group["out"], mask=standard_mask) image_paths = group["in"] + group["out"] image_ids_in = [int(os.path.basename(x).split(".")[0]) for x in group["in"]] image_ids_out = [int(os.path.basename(x).split(".")[0]) for x in group["out"]] image_ids = image_ids_in + image_ids_out mr.index = image_ids # We will save a data frame of pearson scores (to calculate accuracies later) comparison_dfs = pandas.DataFrame() for image_pair in image_pairs:
# Images by Concept data frame, our X
X = pandas.read_csv(labels_tsv, sep="\t", index_col=0)

# Dictionary to look up image files (4mm)
# NOTE(review): unpickling can execute arbitrary code; only load lookup files
# from a trusted source.
with open(image_lookup, "rb") as lookup_file:
    lookup = pickle.load(lookup_file)

# Get standard mask, 4mm
standard_mask = get_standard_mask(4)

# We will save data to dictionary
result = dict()

concepts = X.columns.tolist()

# We will go through each voxel (column) in a data frame of image data
image_paths = list(lookup.values())
mr = get_images_df(file_paths=image_paths, mask=standard_mask)
image_ids = [int(os.path.basename(x).split(".")[0]) for x in image_paths]
mr.index = image_ids

# what we can do is generate a predicted image for a particular set of
# concepts (e.g, for a left out image) by simply multiplying the concept
# vector by the regression parameters at each voxel. then you can do the
# mitchell trick of asking whether you can accurately classify two left-out
# images by matching them with the two predicted images.
regression_params = pandas.DataFrame(0, index=mr.columns, columns=concepts)

print("Training voxels...")
# The training rows and the design matrix are the same for every voxel, so
# they are built once outside the loop.
train = mr.index
Xtrain = X.loc[train, :]
for voxel in mr.columns:
    Y = mr.loc[train, voxel].tolist()
    # Use regularized regression
    clf = linear_model.ElasticNet(alpha=0.1)
    clf.fit(Xtrain, Y)
X = scaled

# Dictionary to look up image files (4mm)
# NOTE(review): unpickling can execute arbitrary code; only load lookup files
# from a trusted source.
with open(image_lookup, "rb") as lookup_file:
    lookup = pickle.load(lookup_file)

# Get standard mask, 4mm
standard_mask = get_standard_mask(4)

# We will save data to dictionary
result = dict()

concepts = X.columns.tolist()

# We will go through each voxel (column) in a data frame of image data
image_paths = list(lookup.values())
mr = get_images_df(file_paths=image_paths, mask=standard_mask)
image_ids = [int(os.path.basename(x).split(".")[0]) for x in image_paths]
mr.index = image_ids

norm = pandas.DataFrame(columns=mr.columns)

# Normalize the image data by number of subjects
# V* = V/sqrt(S)
for row in mr.iterrows():
    # iterrows yields (index label, row values) pairs
    subid = row[0]
    number_of_subjects = image_df.loc[subid].number_of_subjects.tolist()
    norm_vector = row[1] / numpy.sqrt(number_of_subjects)
    norm.loc[subid] = norm_vector

# The unnormalized data is no longer needed
del mr
def likelihood_groups_from_tree(tree, standard_mask, input_folder,
                                image_pattern="[0]+%s[.]", output_folder=None,
                                node_pattern="[0-9]+"):
    '''likelihood_groups_from_tree
    Generate likelihood groups from a pybraincompare.ontology.tree object.
    These groups can then be used to calculate likelihoods (eg,
    p(activation|cognitive process)). The groups are optionally written as
    pickle objects, because it is ideal to calculate likelihoods on a cluster.

    :param tree: dict
        a dictionary of nodes, with base nodes matching a particular pattern
        assumed to be image (.nii.gz) files.
    :param standard_mask: nifti image (nibabel)
        standard image mask that images are registered to
    :param input_folder: path
        the folder of images to be matched to the nodes of the tree.
    :param image_pattern: str
        a regular expression template (one %s slot, filled with the node
        name) to find image files in input_folder. Default will match any
        number of leading zeros, the node name, and any extension.
    :param output_folder: path
        a folder path to save likelihood groups [default None: nothing is
        written]
    :param node_pattern: str
        a regular expression to find image nodes in the tree, matched to
        name. Default is a number of any length [neurovault image primary
        keys].
    :raises ValueError: if more than one file matches an image node
    :return groups: list
        one dict per concept node that has at least one image both inside and
        outside of it. When output_folder is given each dict is also pickled
        to <output_folder>/pbc_group_<nid>.pkl

        ..note::

             group["nid"] = "trm_12345"
             group["name"] = name of the concept node
             group["in"] = ["path1","path2",..."pathN"]
             group["out"] = ["path3","path4",..."pathM"]
             group["meta"]: meta data for the node
             group["range_table"]: the result of make_range_table over the
                                   entire set of matched images
    '''
    # Find all nodes in the tree, match to images in folder
    nodes = get_node_fields(tree, field="name", nodes=[])
    contender_files = glob("%s/*" % input_folder)

    # Image nodes are the ones whose name matches the specified pattern
    find_nodes = re.compile(node_pattern)
    image_nodes = numpy.unique(
        [node for node in nodes if find_nodes.match(node)]).tolist()

    # Node names must now be matched to files
    file_lookup = dict()
    file_names = [os.path.split(path)[-1] for path in contender_files]
    for node in image_nodes:
        find_file = re.compile(image_pattern % node)
        idx = [position for position, file_name in enumerate(file_names)
               if find_file.match(file_name)]
        if len(idx) > 1:
            # Both values must be passed as one tuple to the % operator
            raise ValueError("ERROR: found %s images that match pattern %s."
                             % (len(idx), find_file.pattern))
        elif len(idx) == 0:
            print("Did not find file for %s, will not be included in analysis." % (node))
        else:
            file_lookup[node] = contender_files[idx[0]]

    # Use pandas dataframe to not risk weird dictionary iteration orders
    files = pandas.DataFrame(list(file_lookup.values()), columns=["path"])
    files.index = list(file_lookup.keys())

    # The remaining nodes in the tree (that are not images) will have a RI score
    image_node_names = set(image_nodes)
    concept_nodes = [x for x in nodes if x not in image_node_names]

    # Create table of voxels for all images (the top node)
    mr = get_images_df(file_paths=files.path, mask=standard_mask)
    mr.index = files.index
    range_table = make_range_table(mr)

    # GROUPS ----------------------------------------------------
    # Find groups for image sets at each node (**node names must be unique)
    # This is images at (and in lower levels) of node vs. everything else
    groups = []
    for concept_node in concept_nodes:
        node = get_node_by_name(tree, concept_node)
        # Check the lookup result before reading any of its fields
        if not node:
            continue
        node_id = node["nid"]  # used in the pickle file name
        node_meta = node["meta"]
        all_children = get_node_fields(node, "name", [])
        children_in = [child for child in all_children if child in files.index]
        inside = set(children_in)
        children_out = [child for child in files.index if child not in inside]
        if len(children_in) > 0 and len(children_out) > 0:
            print("Generating group for concept node %s" % (concept_node))
            group = {"in": files.path.loc[children_in].unique().tolist(),
                     "out": files.path.loc[children_out].unique().tolist(),
                     "range_table": range_table,
                     "meta": node_meta,
                     "nid": node_id,
                     "name": concept_node}
            groups.append(group)
            if output_folder is not None:
                group_pickle = "%s/pbc_group_%s.pkl" % (output_folder, node_id)
                with open(group_pickle, "wb") as filey:
                    pickle.dump(group, filey)
    return groups
def get_likelihood_df(nid, in_images, out_images, standard_mask, range_table,
                      threshold=2.96, output_folder=None, method=["binary"]):
    '''get_likelihood_df
    Calculate likelihoods for the "in" and "out" image groups and optionally
    save them as pickles. The user must specify the method [default is
    binary].

    Method details:
      ranges - likelihood in all thresholds defined in range_table
               (calculate_likelihood_in_ranges)
      binary - likelihood above / below a certain level
               [threshold, default=2.96] (calculate_likelihood_binary)

    Note: you do not need to calculate likelihoods in advance for the mean
    metric (using a derivation of the distance from a mean image as a
    probability score). In this case, use
    calculate_reverse_inference_distance.

    :param nid: str
        a unique identifier, typically a node ID from a
        pybraincompare.ontology.tree
    :param in_images: list
        a list of files for the "in" group relevant to some concept
    :param out_images: list
        the rest
    :param standard_mask: nibabel.Nifti1Image object
        the standard mask images are in space of
    :param range_table: pandas data frame
        a data frame of ranges with "start" and "stop"; can be generated with
        pybraincompare.inference.make_range_table
    :param threshold: float
        level used by the "binary" method [default 2.96]
    :param output_folder: path
        folder to save likelihood pickles [default is None]. If not
        specified, the df objects are returned. If specified, each entry is
        replaced by the return value of save_likelihood_pickle.
    :param method: list
        any of "ranges" and "binary"; the default list is only read, never
        modified
    :raises ValueError: if in_images and out_images share any image
    :return df: dict
        keys "in_ranges"/"out_ranges" (ranges method) and "in_bin"/"out_bin"
        (binary method). When output_folder is given, "in_out" is kept as a
        deprecated alias of "out_bin" for callers of the old, misspelled key.
    '''
    if len(numpy.intersect1d(in_images, out_images)) > 0:
        raise ValueError("ERROR: in_images and out_images should not share images!")

    # Read all images into one data frame, labelled by path
    all_images = in_images + out_images
    mr = get_images_df(file_paths=all_images, mask=standard_mask)
    mr.index = all_images
    in_subset = mr.loc[in_images]
    out_subset = mr.loc[out_images]

    # Calculate likelihood for user defined methods
    df = dict()
    if "ranges" in method:
        # Each key is computed from its own group (these were previously
        # swapped: the "in" images were stored under "out_ranges").
        df["in_ranges"] = calculate_likelihood_in_ranges(in_subset, range_table)
        df["out_ranges"] = calculate_likelihood_in_ranges(out_subset, range_table)
        if output_folder:
            df["in_ranges"] = save_likelihood_pickle(
                df["in_ranges"], output_folder, nid, "in_ranges")
            df["out_ranges"] = save_likelihood_pickle(
                df["out_ranges"], output_folder, nid, "out_ranges")

    if "binary" in method:
        df["in_bin"] = calculate_likelihood_binary(in_subset, threshold)
        df["out_bin"] = calculate_likelihood_binary(out_subset, threshold)
        if output_folder:
            df["in_bin"] = save_likelihood_pickle(
                df["in_bin"], output_folder, nid, "in_bin_%s" % threshold)
            # Previously stored under the typo key "in_out", leaving
            # "out_bin" pointing at the unsaved data frame.
            df["out_bin"] = save_likelihood_pickle(
                df["out_bin"], output_folder, nid, "out_bin_%s" % threshold)
            df["in_out"] = df["out_bin"]  # deprecated alias, see docstring
    return df
# Write one resampled map per column of df, then load all of the maps into a
# single data frame X indexed by the integer parsed from each file name.
brain_4mm = get_standard_mask(4)
for pmid in df.columns:
    pmid_mr = df[pmid].tolist()
    # Fill a zero volume shaped like the dataset mask: the column values go
    # into the voxels where the mask data is nonzero
    empty_nii = numpy.zeros(dataset.masker.volume.shape)
    empty_nii[dataset.masker.volume.get_data()!=0] = pmid_mr
    empty_nii = nibabel.Nifti1Image(empty_nii,affine=dataset.masker.volume.get_affine())
    # The volume is saved to a scratch file, which is what gets resampled
    tmpnii = "%s/tmp.nii.gz" %(neurosynth_feature_maps)
    nibabel.save(empty_nii,tmpnii)
    # ***Interpolation must be nearest as neurosynth data is binary!
    nii = resample_img(tmpnii,target_affine=brain_4mm.get_affine(),interpolation="nearest")
    nibabel.save(nii,"%s/%s.nii.gz" %(neurosynth_feature_maps,pmid))

# Load into image data frame
# The scratch file is removed before globbing, so it is not picked up as a map
os.remove("%s/tmp.nii.gz"%(neurosynth_feature_maps))
concept_maps_4mm = glob("%s/*.nii.gz"%(neurosynth_feature_maps))
X = get_images_df(file_paths=concept_maps_4mm,mask=brain_4mm)
# Strip the extension, the folder and any slashes from each path; what is
# left is parsed as an integer
Xindex = [int(x.replace(".nii.gz","").replace(neurosynth_feature_maps,"").replace("/","")) for x in concept_maps_4mm]
X.index = Xindex

### ENCODING MODEL
## This is our "features" data frame
# X=load_neurosynth_term_mappings() # size nterms X npapers - for each paper, a binary encoding of the presence/absence of each cog atlas term in the abstract
# mapping=numpy.zeros(nvoxels,nterms)
# neurosynth_map=load_data() # data from all voxels, size novels X npapers, binary encoding of activation presence/absence

# Get rid of entry with all zeros (PMID does not have abstract)
# NOTE(review): `features` is defined outside this span; 9728909 is presumably
# the PMID of that entry - confirm.
features=features.drop(9728909,axis=0)
X=X.drop(9728909,axis=0)