def read_file_info(self, infname, n_paths, calc_adj_mi):
    """ Read <n_paths> ClusterPaths from the csv file <infname>, one partition per line. """
    paths = [None for _ in range(n_paths)]
    with opener('r')(infname) as csvfile:
        reader = csv.DictReader(csvfile)
        for line in reader:
            if line['partition'] == '':
                raise Exception('ERROR null partition (one of the processes probably got passed zero sequences)')  # shouldn't happen any more FLW
            # clusters are ';'-separated, sequence ids within a cluster are ':'-separated
            uids = []
            for cluster in line['partition'].split(';'):
                uids.append([unique_id for unique_id in cluster.split(':')])
            path_index = int(line['path_index'])
            if paths[path_index] is None:
                paths[path_index] = ClusterPath(int(line['initial_path_index']))
            else:
                assert paths[path_index].initial_path_index == int(line['initial_path_index'])
            n_procs = int(line['n_procs']) if 'n_procs' in line else 1
            logweight = float(line['logweight']) if 'logweight' in line else None
            adj_mi = -1
            if calc_adj_mi:
                adj_mi = utils.mutual_information(uids, self.reco_info, debug=False) if self.reco_info is not None else -1
            paths[path_index].add_partition(uids, float(line['logprob']), n_procs=n_procs, logweight=logweight, adj_mi=adj_mi)

    # make sure every path got filled in, and that no partition is empty
    for cp in paths:
        if cp is None:
            raise Exception('None type path read from %s' % infname)
        for ptn in cp.partitions:
            if len(ptn) == 0:
                raise Exception('zero length partition read from %s' % infname)

    return paths
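# A minimal, hypothetical sketch (not from the original file) of the csv layout that
# read_file_info() expects, inferred from the columns it parses above: 'partition' is a
# ';'-separated list of clusters, and each cluster is a ':'-separated list of sequence ids.
#
#   path_index,initial_path_index,logprob,n_procs,logweight,partition
#   0,0,-152.3,2,0.0,a:b;c:d:e
#   0,0,-148.7,2,0.0,a:b:c:d:e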
def add_next_global_partition():
    # closure: uses <fileinfos>, <ipath>, <calc_adj_mi> and <self> from the enclosing method's scope
    global_partition = []
    global_logprob = 0.
    for ifile in range(len(fileinfos)):  # combine the first line in each file to make a global partition
        for cluster in fileinfos[ifile][ipath].partitions[0]:
            global_partition.append(list(cluster))
        global_logprob += fileinfos[ifile][ipath].logprobs[0]
    global_adj_mi = -1
    if calc_adj_mi:
        global_adj_mi = utils.mutual_information(global_partition, self.reco_info, debug=False) if self.reco_info is not None else -1
    self.paths[ipath].add_partition(global_partition, global_logprob, n_procs=len(fileinfos), logweight=0., adj_mi=global_adj_mi)  # don't know the logweight yet (or maybe at all!)
def write_partitions(self, writer, is_data, reco_info, true_partition, smc_particles, path_index, n_to_write=None, calc_adj_mi=None):
    for ipart in self.get_partition_subset(n_partitions=n_to_write):
        part = self.partitions[ipart]
        cluster_str = ''
        bad_clusters = []  # inferred clusters that aren't really all from the same event
        for ic in range(len(part)):
            if ic > 0:
                cluster_str += ';'
            cluster_str += ':'.join(part[ic])

            if not is_data:
                same_event = utils.from_same_event(is_data, reco_info, part[ic])  # are all the sequences from the same event?
                entire_cluster = True  # ... and if so, are they the entire true cluster?
                if same_event:
                    reco_id = reco_info[part[ic][0]]['reco_id']  # they've all got the same reco_id, so pick an arbitrary one
                    true_cluster = true_partition[reco_id]
                    for uid in true_cluster:
                        if uid not in part[ic]:
                            entire_cluster = False
                            break
                else:
                    entire_cluster = False
                if not same_event or not entire_cluster:
                    bad_clusters.append(':'.join(part[ic]))

        if len(bad_clusters) > 25:
            bad_clusters = ['too', 'long']

        row = {'logprob' : self.logprobs[ipart],
               'n_clusters' : len(part),
               'n_procs' : self.n_procs[ipart],
               'clusters' : cluster_str}
        if smc_particles > 1:
            row['path_index'] = path_index
            row['logweight'] = self.logweights[ipart]
        if not is_data:
            if calc_adj_mi is None or self.adj_mis[ipart] != -1:  # if we don't want to write any adj mis, or if we already calculated it
                row['adj_mi'] = self.adj_mis[ipart]
            else:
                if calc_adj_mi == 'best' and ipart == self.i_best:  # only calculate adj_mi for the best partition
                    row['adj_mi'] = utils.mutual_information(part, reco_info)
                else:
                    row['adj_mi'] = self.adj_mis[ipart]
            row['n_true_clusters'] = len(true_partition)
            row['bad_clusters'] = ';'.join(bad_clusters)
        writer.writerow(row)
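# A minimal usage sketch (an assumption, not part of the original module): driving
# write_partitions() with a csv.DictWriter whose fieldnames cover the row keys built
# above. Here <cp> stands in for a ClusterPath instance and the file name is hypothetical.
#
#   import csv
#
#   fieldnames = ['logprob', 'n_clusters', 'n_procs', 'clusters', 'path_index',
#                 'logweight', 'adj_mi', 'n_true_clusters', 'bad_clusters']
#   with open('partitions.csv', 'w') as csvfile:
#       writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#       writer.writeheader()
#       cp.write_partitions(writer, is_data=True, reco_info=None, true_partition=None,
#                           smc_particles=1, path_index=0, n_to_write=10)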
def preprocess(collab, work, edu, advs, prods, stop_words=[]):
    # drop rows with no collaborations
    data = collab[collab['Colaboracoes'] != 0]

    # drop work rows with missing vals and join to running data
    work = drop_if_missing(work)
    data = data.join(work, how='inner')

    # coerce numerical types in edu and drop rows with missing values,
    # except post-doc and specialization, which can be NaN
    for col in edu.columns:
        if col in ('inicio', 'inicio.1', 'inicio.2', 'fim', 'fim.1', 'fim.2'):
            edu[col] = pd.to_numeric(edu[col], errors='coerce')
    for column in edu.columns:
        if column != 'pos-doutorado' and column != 'especializacao':
            edu = edu[~pd.isna(edu[column])]
    # join to running data
    data = data.join(edu, how='inner')

    # join advisees data to running data
    data = data.join(advs, how='inner')

    # remove rows with no scientific production and join to running data
    prods = prods[(prods != 0).any(axis=1)]
    data = data.join(prods, how='inner')

    # since there is high variability in how users specify places and courses
    # in their CVs, we cluster them with LSA + K-Means

    # cluster places
    places = [col for col in data.columns if 'local' in col] + ['Instituicao Atual']
    data = cluster_text(data, columns=places, n_clusters=3000, stop_words=stop_words)

    # cluster higher education
    courses = ['doutorado', 'graduacao', 'especializacao', 'mestrado', 'pos-doutorado']
    data = cluster_text(data, columns=courses, n_clusters=500, stop_words=stop_words)

    # compute collaboration probabilities
    collab = data['Colaboracoes']
    total = len(collab)
    collab_prob = [np.sum(collab == x) / total for x in np.unique(collab)]

    # compute mutual information between features and discard those that are
    # independent from collaborations
    all_cols = []
    mis = []
    for column in sorted(data.columns):
        if column != 'Colaboracoes':
            # compute mutual information
            mi = utils.mutual_information(collab, data[column], X_marginal=collab_prob)
            all_cols.append(column)
            mis.append(mi)
            # discard independent features
            if np.isclose(mi, 0):
                data = data.drop(columns=column)

    return data, mis, all_cols
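# A minimal usage sketch (an assumption, not part of the original module). It assumes the
# five inputs are pandas DataFrames sharing a common index (e.g. a researcher id) so the
# inner joins above line up; the file names and stop words below are hypothetical.
#
#   import pandas as pd
#
#   collab = pd.read_csv('collab.csv', index_col=0)
#   work = pd.read_csv('work.csv', index_col=0)
#   edu = pd.read_csv('edu.csv', index_col=0)
#   advs = pd.read_csv('advs.csv', index_col=0)
#   prods = pd.read_csv('prods.csv', index_col=0)
#
#   data, mis, cols = preprocess(collab, work, edu, advs, prods,
#                                stop_words=['de', 'da', 'do', 'e', 'em'])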