def redis_set_helper(key, data, pipe, npz=False):
    with BytesIO() as b:
        if npz:
            save_npz(b, data)
        else:
            np.save(b, data)
        pipe.set(key, b.getvalue())
def _save_and_load(matrix):
    fd, tmpfile = tempfile.mkstemp(suffix='.npz')
    os.close(fd)
    try:
        save_npz(tmpfile, matrix)
        loaded_matrix = load_npz(tmpfile)
    finally:
        os.remove(tmpfile)
    return loaded_matrix
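# Minimal usage sketch for the helper above (assumes it lives in the same module,
# where tempfile, os, save_npz and load_npz are already imported): round-trip a
# small CSR matrix and confirm that values and sparsity pattern survive.
import numpy as np
from scipy.sparse import csr_matrix

_example = csr_matrix(np.arange(6).reshape(2, 3))
assert (_save_and_load(_example) != _example).nnz == 0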
def transferG2ADJ():
    with open("reddit/reddit-G.json") as f:
        G = json_graph.node_link_graph(json.load(f))
    with open("reddit/reddit-id_map.json") as f:
        feat_id_map = json.load(f)
    feat_id_map = {id: val for id, val in feat_id_map.items()}
    numNode = len(feat_id_map)
    newEdges0 = [feat_id_map[edge[0]] for edge in G.edges()]
    newEdges1 = [feat_id_map[edge[1]] for edge in G.edges()]
    # for edge in G.edges():
    #     adj[feat_id_map[edge[0]], feat_id_map[edge[1]]] = 1
    # Build the adjacency matrix directly in sparse form instead of allocating a
    # dense (numNode x numNode) array that was never used.
    adj = sp.csr_matrix((np.ones((len(newEdges0),)), (newEdges0, newEdges1)),
                        shape=(numNode, numNode))
    sp.save_npz("reddit_adj.npz", adj)
def main():
    args, kwargs = get_args()
    result = preprocess(args.data, format=args.format, kwargs=kwargs,
                        col_order=args.col_order, k_cores=args.k_cores,
                        save_map=args.save_map, output=args.output,
                        user_map=args.user_map, item_map=args.item_map,
                        train_size=args.train_size, dtype=args.dtype,
                        debug=args.debug, timestamp=args.timestamp)
    if args.save_map:
        json_dump(result['user_map'], os.path.join(args.output, args.user_map))
        json_dump(result['item_map'], os.path.join(args.output, args.item_map))
    if args.timestamp:
        save_npz(os.path.join(args.output, args.timestamp_file), result['timestamp'])
    save_npz(os.path.join(args.output, args.train_file), result['train'])
    save_npz(os.path.join(args.output, args.test_file), result['test'])
def save_similarity(self, name_file, compressed=False):
    sps.save_npz(name_file, self.s, compressed=compressed)
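# Hedged counterpart sketch (the surrounding class is not shown): scipy.sparse.load_npz
# restores the matrix regardless of whether compression was used when saving.
import scipy.sparse as sps

def load_similarity(name_file):
    return sps.load_npz(name_file)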
    for i, w in enumerate(line_as_idx):
        # keep count
        k += 1
        if k % 10000 == 0:
            print("%s/%s" % (k, num_tokens))
        start = max(0, i - context_size)
        end = min(len(line_as_idx), i + context_size)
        for c in line_as_idx[start:i]:
            wc_counts[w, c] += 1
        for c in line_as_idx[i+1:end]:
            wc_counts[w, c] += 1
    print("Finished counting")
    save_npz('pmi_counts_%s.npz' % V, csr_matrix(wc_counts))
else:
    wc_counts = load_npz('pmi_counts_%s.npz' % V)

# context counts get raised ^ 0.75
c_counts = wc_counts.sum(axis=0).A.flatten() ** 0.75
c_probs = c_counts / c_counts.sum()
c_probs = c_probs.reshape(1, V)

# PMI(w, c) = #(w, c) / #(w) / p(c)
# pmi = wc_counts / wc_counts.sum(axis=1) / c_probs  # works only if numpy arrays
pmi = wc_counts.multiply(1.0 / wc_counts.sum(axis=1) / c_probs).tocsr()
# this operation changes it to a coo_matrix
def create_local_clustering(self, overwrite, r_thresh, min_region_size=80): """ API for performing any of a variety of clustering routines available through NiLearn. """ import os.path as op from scipy.sparse import save_npz, load_npz from nilearn.regions import connected_regions try: conn_comps = connected_regions( self._clust_mask_corr_img, extract_type="connected_components", min_region_size=min_region_size, ) self._conn_comps = conn_comps[0] self.num_conn_comps = len(conn_comps[1]) except BaseException: try: raise ValueError("Clustering mask is empty!") except ValueError: import sys sys.exit(1) if not self._conn_comps: if np.sum(np.asarray(self._clust_mask_corr_img.dataobj)) == 0: try: raise ValueError("Clustering mask is empty!") except ValueError: import sys sys.exit(1) else: self._conn_comps = self._clust_mask_corr_img self.num_conn_comps = 1 print(f"Detected {self.num_conn_comps} connected components in " f"clustering mask with a mininimum region " f"size of {min_region_size}") if (self.clust_type == "complete" or self.clust_type == "average" or self.clust_type == "single"): if self.num_conn_comps > 1: try: raise ValueError( "Clustering method unstable with spatial constrainsts " "applied to multiple connected components.") except ValueError: import sys sys.exit(0) if (self.clust_type == "ward" and self.num_conn_comps > 1) or self.clust_type == "ncut": if self.k < self.num_conn_comps: try: raise ValueError( "k must minimally be greater than the total number of " "connected components in " "the mask in the case of agglomerative clustering.") except ValueError: import sys sys.exit(0) if self.local_corr == "tcorr" or self.local_corr == "scorr": self._local_conn_mat_path = (f"{self.uatlas.split('.nii')[0]}_" f"{self.local_corr}_conn.npz") if (not op.isfile( self._local_conn_mat_path)) or (overwrite is True): from pynets.fmri.clustools import ( make_local_connectivity_tcorr, make_local_connectivity_scorr, ) if self.local_corr == "tcorr": self._local_conn = make_local_connectivity_tcorr( self._func_img, self._clust_mask_corr_img, thresh=r_thresh) elif self.local_corr == "scorr": self._local_conn = make_local_connectivity_scorr( self._func_img, self._clust_mask_corr_img, thresh=r_thresh) else: try: raise ValueError( "Local connectivity type not available") except ValueError: import sys sys.exit(0) print( f"Saving spatially constrained connectivity structure" f" to: {self._local_conn_mat_path}") save_npz(self._local_conn_mat_path, self._local_conn) elif op.isfile(self._local_conn_mat_path): self._local_conn = load_npz(self._local_conn_mat_path) elif self.local_corr == "allcorr": if self.clust_type == "ncut": try: raise ValueError( "Must select either `tcorr` or `scorr` local " "connectivity option if you are using " "`ncut` clustering method") except ValueError: import sys sys.exit(0) self._local_conn = "auto" else: try: raise ValueError( "Local connectivity method not recognized. Only tcorr," " scorr, and auto are currently " "supported") except ValueError: import sys sys.exit(0) else: self._local_conn = "auto" return
def save(self):
    if self.save_file is not None:
        if self.sparse:
            ss.save_npz(self.save_file, self.matrix)
        else:
            self.matrix.tofile(self.save_file)
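# Hedged loading counterpart for save() above (function name and parameters are
# illustrative). Note two asymmetries: numpy's tofile() drops dtype and shape, so
# the dense branch needs them supplied externally; and scipy's save_npz appends
# ".npz" to a filename that lacks it, so the path used for loading must match.
import numpy as np
import scipy.sparse as ss

def load_matrix(save_file, is_sparse=True, dtype=None, shape=None):
    if is_sparse:
        return ss.load_npz(save_file)
    return np.fromfile(save_file, dtype=dtype).reshape(shape)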
# elif item == 'neutral':
#     l.append(0.0)
# lab_test = l

#print(type(fvec_test))
#print('convert test to sparse')
#_test_arr = csr_matrix(fvec_test)
#print(X_test_arr.data.nbytes)
#sparse.save_npz("test/testmatrix.npz", X_test_arr)
#X_test_arr = sparse.load_npz("test/testmatrix.npz")
#print('converted')

print('convert train to sparse')
X_train_arr = csr_matrix(fvec_train)
print(X_train_arr.data.nbytes)
sparse.save_npz("train/trainmatrix.npz", X_train_arr)
#X_train_arr = sparse.load_npz("train/trainmatrix.npz")  # load saved sparse matrix
print('converted')

'''
#train_valid_test split
fvec_train, fvec_tes, lab_train, lab_te = train_test_split(X_train_arr, lab_train, test_size=0.0, random_state=1)
fvec_test, o, lab_test, o = train_test_split(X_test_arr, lab_test, test_size=0.0, random_state=1)

#convert to np arrays
X_train_arr = np.array(fvec_train).astype(float)
X_test_arr = np.array(fvec_test).astype(float)
y_train_arr = np.array(lab_train).astype(float)
y_test_arr = np.array(lab_test).astype(float)
#exec(open("mkmatslowly.py").read())
import numpy as np
import scipy.sparse as ss
import json

# (from args) get start and end indices
index = 0
start = 0
end = 1

f = open("names.txt")
trk = [line[:-1] for line in f]
f.close()

mat = ss.csr_matrix((0, len(trk)), dtype=np.int8)
slc = start * 1000
for i in range(start, end):
    f = open(f"mpd.slice.{slc}-{slc+999}.json")
    j = json.load(f)
    f.close()
    m = np.zeros(shape=(1000, len(trk)), dtype=np.int8)
    for row in range(1000):
        pl = j["playlists"][row]
        # use a set so membership tests work for every track in trk
        # (a generator would be exhausted after the first lookup)
        t = {x["track_uri"] for x in pl["tracks"]}
        m[row] = [int(x in t) for x in trk]
    slc += 1000
    mat = ss.vstack((mat, ss.csr_matrix(m)))

ss.save_npz("mat%02d.npz" % index, mat)
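# Hedged sketch: the per-run chunk files written above (mat00.npz, and further
# chunks from other runs) can be recombined later by loading each one and
# stacking them vertically.
import scipy.sparse as ss

def combine_chunks(paths):
    return ss.vstack([ss.load_npz(p) for p in paths]).tocsr()

# e.g. full = combine_chunks(["mat00.npz", "mat01.npz"])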
def save_proximity(ds, radius, A):
    logger.info("Saving proximity matrix...")
    fname = os.path.join(ds.a.data_path,
                         "proximity_radius_%s_%s.npz" % (str(radius), ds.a.brain_mask))
    save_npz(fname, A.tocoo())
    return
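# Hedged counterpart sketch to save_proximity: load_npz returns the saved COO
# matrix; converting to CSR makes row slicing cheap for neighborhood lookups.
import os
from scipy.sparse import load_npz

def load_proximity(ds, radius):
    fname = os.path.join(ds.a.data_path,
                         "proximity_radius_%s_%s.npz" % (str(radius), ds.a.brain_mask))
    return load_npz(fname).tocsr()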
def load_net_ann_datasets(out_dir, taxon, dataset, input_settings, alg_settings,
                          uniprot_taxon_file, **kwargs):
    sparse_net_file = "%s/%s-net.npz" % (out_dir, taxon)
    node2idx_file = sparse_net_file + "-node-ids.txt"
    swsn_weights_file = sparse_net_file + "-swsn-weights.txt"
    sparse_ann_file = "%s/ann.npz" % (out_dir)
    if not kwargs.get('forcenet') and \
            (os.path.isfile(sparse_net_file) and os.path.isfile(node2idx_file)) and \
            os.path.isfile(sparse_ann_file):
        print("Reading network from %s" % (sparse_net_file))
        W = sp.load_npz(sparse_net_file)
        print("\t%d nodes and %d edges" % (W.shape[0], len(W.data) / 2))
        print("Reading node names from %s" % (node2idx_file))
        prots = utils.readItemList(node2idx_file, 1)
        new_net_obj = setup.Sparse_Networks(W, prots)
        if os.path.isfile(swsn_weights_file):
            print("Reading swsn weights file %s" % (swsn_weights_file))
            weights = [float(w) for w in utils.readItemList(swsn_weights_file, 1)]
            # also load the original networks to get the edge weights for the STRING networks
            net_obj = run_eval_algs.setup_net(input_settings['input_dir'], dataset, **kwargs)
            net_obj.swsn_weights = weights
        else:
            net_obj = new_net_obj
        print("\nReading annotation matrix from %s" % (sparse_ann_file))
        loaded_data = np.load(sparse_ann_file, allow_pickle=True)
        dag_matrix = setup.make_csr_from_components(loaded_data['arr_0'])
        ann_matrix = setup.make_csr_from_components(loaded_data['arr_1'])
        goids, prots = loaded_data['arr_2'], loaded_data['arr_3']
        ann_obj = setup.Sparse_Annotations(dag_matrix, ann_matrix, goids, prots)
        species_to_uniprot_idx = eval_loso.get_uniprot_species(uniprot_taxon_file, ann_obj)
        # TODO eval ann obj
        eval_ann_obj = None
    else:
        # load the network
        # TODO if a subset of the network was run, need to get that subset
        net_obj, ann_obj, eval_ann_obj = run_eval_algs.setup_dataset(
            dataset, input_settings['input_dir'], alg_settings, **kwargs)
        species_to_uniprot_idx = eval_loso.get_uniprot_species(uniprot_taxon_file, ann_obj)
        new_net_obj = net_obj
        # run SWSN if needed
        #if net_obj.multi_net:
        # TODO if LOSO was run, need to leave out the taxon for edge weights to be accurate
        if taxon is not None:
            if kwargs.get('limit_to_taxons_file'):
                # limit the network to the specified species
                # read in the specified taxons from the file
                _, net_taxons = eval_loso.get_selected_species(
                    species_to_uniprot_idx, kwargs['limit_to_taxons_file'])
                net_taxon_prots = net_exp.get_taxon_prots(
                    net_obj.nodes, net_taxons, species_to_uniprot_idx)
                net_obj, ann_obj = net_exp.limit_to_taxons(
                    net_taxon_prots, net_obj=net_obj, ann_obj=ann_obj, **kwargs)
            # leave out the annotations for this taxon ID
            train_ann_mat, test_ann_mat, sp_goterms = eval_loso.leave_out_taxon(
                taxon, ann_obj, species_to_uniprot_idx,
                eval_ann_obj=eval_ann_obj, **kwargs)
            taxon_prots = net_exp.get_taxon_prots(
                net_obj.nodes, [taxon], species_to_uniprot_idx)
            new_net_obj = net_exp.limit_net_to_target_taxon(
                train_ann_mat, taxon_prots, net_obj, ann_obj, **kwargs)
            W = new_net_obj.W
        # else:
        #     W, _ = net_obj.weight_SWSN(ann_obj.ann_matrix)
        #     #new_net_obj =
        else:
            W = net_obj.W
        print("\twriting sparse matrix to %s" % (sparse_net_file))
        sp.save_npz(sparse_net_file, W)
        print("\twriting node2idx labels to %s" % (node2idx_file))
        with open(node2idx_file, 'w') as out:
            out.write(''.join(["%s\t%d\n" % (prot, i)
                               for i, prot in enumerate(net_obj.nodes)]))
        if net_obj.multi_net:
            print("\twriting swsn weights file to %s" % (swsn_weights_file))
            with open(swsn_weights_file, 'w') as out:
                out.write('\n'.join([str(w) for w in new_net_obj.swsn_weights]) + '\n')
            net_obj.swsn_weights = new_net_obj.swsn_weights
        # now store them to a file
        print("\twriting sparse annotations to %s" % (sparse_ann_file))
        # store all the data in the same file
        dag_matrix_data = setup.get_csr_components(ann_obj.dag_matrix)
        ann_matrix_data = setup.get_csr_components(ann_obj.ann_matrix)
        #np.savez_compressed(
        #    sparse_ann_file, dag_matrix_data=dag_matrix_data,
        #    ann_matrix_data=ann_matrix_data, goids=goids, prots=prots)
        np.savez_compressed(sparse_ann_file, dag_matrix_data, ann_matrix_data,
                            ann_obj.goids, ann_obj.prots)
    return net_obj, new_net_obj, ann_obj, eval_ann_obj, species_to_uniprot_idx
    def test_construct_sparse_matrix(self):
        """Construct sparse matrices of the horse racing dataset.

        What will be saved after running this suite
        -------------------------------------------
        """
        df = pd.read_csv(PATH2FEATURES)
        df = df.astype({GROUP_KEY: str})  # astype returns a copy; assign it back
        unique_hnames = df['hname'].unique()
        hname2ind = pd.get_dummies(unique_hnames)
        invalid_rids = df[df['odds'] == 0][GROUP_KEY].unique()
        context_cols = ['n_presi', 'n_avgsi4', 'n_disavgsi', 'n_goavgsi', 'w2c',
                        'eps', 'draw', 'newdis', 'jnowin', 'jwinper', 'jst1miss']

        # [START Construct Competitor & Entity Index Vector]
        n_horses = len(unique_hnames)
        indexing_features = sparse.coo_matrix((0, 2 * n_horses), dtype=np.int8)
        grouped = df[~df[GROUP_KEY].isin(invalid_rids)].groupby(GROUP_KEY)
        pbar = tqdm(total=len(grouped))
        pbar.set_description('Constructing Indexing Matrix...')
        for idx, (_, rdata) in enumerate(grouped):
            entries = rdata['hname']
            n_entries = len(entries)
            val = np.ones(np.power(n_entries, 2))  # shape = (n_entries ** 2, )
            row = np.array([np.repeat(i, n_entries)
                            for i in np.arange(0, n_entries)]).flatten()  # shape = (n_entries ** 2, )
            entry_indices = np.array([hname2ind[hname2ind[hname] == 1].index[0]
                                      for hname in entries])

            # [START Obtain Column Indices]
            col = []
            for jdx, entry_index in enumerate(entry_indices):
                _copy = entry_indices.copy()
                combination_indices = np.delete(entry_indices, jdx)
                entity_index = entry_index + n_horses  # entity & entry
                this_col = np.append(combination_indices, entity_index)
                col += this_col.tolist()
            col = np.array(col)
            # [END Obtain Column Indices]

            index_matrix = sparse.coo_matrix((val, (row, col)),
                                             shape=(n_entries, 2 * n_horses),
                                             dtype=np.int8)

            # [START Assertion]
            # enumerate() yields 0 .. len(grouped) - 1, so compare against
            # len(grouped) - 1 to check the last group (== len(grouped) could
            # never be true).
            if idx == len(grouped) - 1:
                for hname in entries:
                    nonzeros = index_matrix.toarray().nonzero()
                    target_ind = hname2ind[hname2ind[hname] == 1].index[0]
                    _index = nonzeros[0][n_entries - 1]
                    ind_as_entity = index_matrix.toarray().nonzero()[1][_index]
                    self.assertEqual(target_ind, ind_as_entity - n_horses)
            # [END Assertion]

            indexing_features = sparse.vstack([indexing_features, index_matrix])
            # TODO the greater idx is, the slower...
            pbar.update(1)
        pbar.close()
        # [END Construct Competitor & Entity Index Vector]

        # [START Construct Context Vector & Target]
        n_train_rows = 0  # ← MAX Train data index
        n_contexts = len(context_cols)
        context_features = sparse.coo_matrix((0, n_contexts))
        target_series = []
        raceid_series = []
        pbar2 = tqdm(total=len(grouped))
        pbar2.set_description('Constructing Context Matrix ...')
        for idx, (_, rdata) in enumerate(grouped):
            context_matrix = sparse.coo_matrix(rdata[context_cols].values)
            context_features = sparse.vstack([context_features, context_matrix])
            target_series += rdata[TARGET_KEY].values.tolist()
            raceid_series += rdata[GROUP_KEY].values.tolist()

            # [START Get Train Test Split Index]
            if idx == np.round(TRAIN_SIZE * len(grouped)):
                n_train_rows, _ = context_features.shape
            # [END Get Train Test Split Index]
            pbar2.update(1)
        pbar2.close()
        # [END Construct Context Vector & Target]

        # Finally, concat indexing_features & context_features
        features = sparse.hstack((indexing_features, context_features))
        target_series = np.asarray(target_series)

        # Display Stats
        print('---' * 20)
        print('Summary')
        print(f'+ The number of races: {len(grouped)}')
        print(f'+ The number of horses: {n_horses}')
        print('Indexing Matrix Stats')
        print(f'+ Shape of sparse matrix: {indexing_features.shape}')
        print(f'+ The number of nonzero elems: {indexing_features.nnz}')
        print('Context Matrix Stats')
        print(f'Shape of dense matrix: {context_features.shape}')
        print('Train Test Split')
        print(f'+ Maximum Train data row index: {n_train_rows}')
        print('---' * 20)

        # Unit Testing
        self.assertEqual(indexing_features.dtype, np.int8)
        self.assertEqual(indexing_features.shape[0], context_features.shape[0])
        self.assertEqual(features.shape[1],
                         indexing_features.shape[1] + context_features.shape[1])
        self.assertEqual(features.shape[0], len(target_series))
        self.assertEqual(len(target_series), len(raceid_series))

        # Save the features & targets
        sparse.save_npz(FEATURES_OUTPUT, features)
        np.save(TARGETS_OUTPUT, target_series)
        with open(RACEIDS_OUTPUT, mode='wb') as fp:
            pickle.dump(raceid_series, fp)
def chromosome_coverage_read_counts(self, gene_overlap_dat, chrom_gene_df, chrom_exon_df, chrom): """ Determine per-chromosome reads coverage and per-gene read counts from an RNA-seq experiment in a way that properly considers ambiguous reads - if a (paired) read falls entirely within the exonic regions of a *single* gene, only then does read contribute to read count and coverage. The cigar scores from single and paired reads are parsed according to cigar_segment_bounds. 1. Saves compressed coverage array to self.save_dir with file name 'sample_[sample_id]_[chrom].npz' for genes with no overlap with any other gene (a.k.a. "isolated genes") with filename 'chrom_coverage_[sample_id]_[chrom].npz' 2. Saves a dictionary of {gene_name: 1-d numpy gene coverage arrays (concatenated exonic regions)} to a serialized pickle file for all genes that exonic have overlap with other genes (a.k.a. "overlap genes") with filename 'overlap_coverage_[sample_id]_[chrom].pkl' 3. Saves read counts to self.save_dir with filename 'read_counts_[sample_id]_[chrom].csv' NOTE: if the required chromosome coverage files and read count file *already* exist prior to any coverage/read count calculations, Degnorm will default to using those files. This will only happen if a user either moves coverage and read count files from a prior Degnorm pipeline run to the appropriate chromosome directories of the target output directory, or if they re-use a Degnorm pipeline run's output directory. This is *NOT* the same as using a warm-start directory. A warm-start skips coverage/read count calculations entirely, assuming a prior Degnorm run successfully parse all coverage/read counts. :param chrom_gene_df: pandas.DataFrame with `chr`, `gene`, `gene_start`, and `gene_end` columns that delineate the start and end position of a gene's transcript on a chromosome, must be subset to the chromosome in study. :param gene_overlap_dat: dictionary with keys 'isolated_genes' and 'overlap_genes' detailing groups of genes that do not overlap with others and then groups of genes that share any overlap. See gene_processing.get_gene_overlap_structure function. :param chrom_exon_df: pandas.DataFrame with `chr`, `gene`, `start`, `end` columns that delineate the start and end positions of exons on a gene. :param chrom: str chromosome name :return: None. Coverage and read count files are written to self.save_dir. """ # First, load this chromosome's reads. if self.verbose: logging.info( 'SAMPLE {0}, CHR {1} -- begin loading reads from {2}'.format( self.sample_id, chrom, self.filename)) # assess how many genes we have. n_genes = chrom_gene_df.shape[0] # gene_overlap_dat data check: ensure that number isolated genes + number overlapping genes # equals number of genes in genes DataFrame. n_isolated_genes, n_overlap_genes = 0, 0 if gene_overlap_dat['isolated_genes']: n_isolated_genes = len(gene_overlap_dat['isolated_genes']) if gene_overlap_dat['overlap_genes']: n_overlap_genes = np.sum( [len(x) for x in gene_overlap_dat['overlap_genes']]) if n_isolated_genes + n_overlap_genes != n_genes: raise ValueError( 'number of genes contained in gene_overlap_dat does not match that of chrom_gene_df.' ) # create filepaths to non-overlapping read coverage, overlapping read coverage, read count files. 
chrom_cov_file = os.path.join( self.save_dir, 'chrom_coverage_' + self.sample_id + '_' + str(chrom) + '.npz') ol_cov_file = os.path.join( self.save_dir, 'overlap_coverage_' + self.sample_id + '_' + str(chrom) + '.pkl') count_file = os.path.join( self.save_dir, 'read_counts_' + self.sample_id + '_' + str(chrom) + '.csv') # if all required coverage, read count files are present, e.g. created from a previous run attempt, # then skip all calculations and default to the existing files. Addresses issue #30. if ((n_isolated_genes > 0 and os.path.isfile(chrom_cov_file)) or n_isolated_genes == 0) \ and ((n_overlap_genes > 0 and os.path.isfile(ol_cov_file)) or n_overlap_genes == 0) \ and (os.path.isfile(count_file)): if self.verbose: logging.info("""SAMPLE {0}, CHR {1} -- WARNING... All coverage and read count files already present: {0} {1} {2} Defaulting to these files; skipping coverage and read count calculations."""\ .format(chrom_cov_file, ol_cov_file, count_file)) return None # initialize read counts. read_count_dict = {gene: 0 for gene in chrom_gene_df.gene} # set pandas.options.mode.chained_assignment = None to avoid SettingWithCopyWarnings set_option('mode.chained_assignment', None) # ---------------------------------------------------------------------- # # Step 1. Load chromosome's reads and index them. # ---------------------------------------------------------------------- # reads_df = self.load_chromosome_reads(chrom) if self.verbose: logging.info( 'SAMPLE {0}, CHR {1} -- reads successfully loaded. shape = {2}' .format(self.sample_id, chrom, reads_df.shape)) # append end position to reads based on cigar score. reads_df['end_pos'] = reads_df['pos'] + reads_df['cigar'].apply( lambda x: sum([int(k) for k, v in re.findall(r'(\d+)([A-Z]?)', x)])) # assign row number to read ID column. reads_df['read_id'] = range(reads_df.shape[0]) # easy win: drop reads whose start position is < minimum start position of a gene, # and drop reads whose end position is > maximum start position of a gene min_gene_start, max_gene_end = chrom_gene_df.gene_start.min( ) - 1, chrom_gene_df.gene_end.max() - 1 reads_df = reads_df[(reads_df.pos >= (min_gene_start)) & (reads_df.end_pos <= (max_gene_end))] # If working with paired reads, # ensure that we've sequestered paired reads (eliminate any query names only occurring once). if self.paired: qname_counts = reads_df.qname_unpaired.value_counts() paired_occ_reads = qname_counts[qname_counts == 2].index.values.tolist() reads_df = reads_df[reads_df.qname_unpaired.isin(paired_occ_reads)] # ---------------------------------------------------------------------- # # Step 2. Drop reads that don't fully fall within union of all exons. # ---------------------------------------------------------------------- # chrom_len = self.header[self.header.chr == chrom].length.iloc[0] tscript_vec = np.ones( [chrom_len], dtype=int) # large vector, will delete after using. # build binary 0/1 exon/intron indicator vector. # Need to account for exon data being 1-indexed, tscript_vec is 0-indexed, but # exon end positions are inclusive. exon_starts = chrom_exon_df.start.values - 1 exon_ends = chrom_exon_df.end.values for i in range(len(exon_starts)): tscript_vec[exon_starts[i]:exon_ends[i]] = 0 del exon_starts, exon_ends gc.collect() # store read_ids of reads to drop, and initialize dropped read count. drop_reads = list() # store read match region bounds, so that we only parse CIGAR strings once. read_bounds = list() # use values array, faster access. 
dat = reads_df[['cigar', 'pos', 'read_id']].values # for paired reads, perform special parsing of CIGAR strings to avoid double-counting of overlap regions. if self.paired: for ii in np.arange(1, dat.shape[0], 2): # obtain read region bounds. bounds_1 = cigar_segment_bounds(dat[ii - 1, 0], start=dat[ii - 1, 1]) bounds_2 = cigar_segment_bounds(dat[ii, 0], start=dat[ii, 1]) # leverage nature of alignments of paired reads to find disjoint coverage ranges. min_bounds_1, max_bounds_1 = min(bounds_1), max(bounds_1) min_bounds_2, max_bounds_2 = min(bounds_2), max(bounds_2) if max_bounds_2 >= max_bounds_1: bounds_2 = [ max_bounds_1 + 1 if j <= max_bounds_1 else j for j in bounds_2 ] else: bounds_2 = [ min_bounds_1 - 1 if j >= min_bounds_1 else j for j in bounds_2 ] bounds_2.sort() # aggregate read pair's bounds. bounds = bounds_1 + bounds_2 # iterate over match regions. If a single region is not fully contained # within exon regions, drop the pair. drop_read = False for j in np.arange(1, len(bounds), step=2): # check whether matching regions on tscript_vec are fully contained within exonic regions. # note that right-bounds are inclusive. if np.sum( tscript_vec[(bounds[j - 1]):(bounds[j] + 1)]) > 0: drop_read = True # append read id to set of read indices to drop (if appropriate). if drop_read: drop_reads.extend([dat[ii - 1, 2], dat[ii, 2]]) # otherwise, append match region bounds list. Note: endpoints of regions are inclusive. else: read_bounds.append(bounds) # for single-read RNA-Seq experiments, we do not need such special consideration. else: for ii in np.arange(dat.shape[0]): # obtain read regions bounds. bounds = cigar_segment_bounds(dat[ii, 0], start=dat[ii, 1]) # iterate over match regions. If a single region is not fully contained # within exon regions, drop the read. drop_read = False for j in np.arange(1, len(bounds), step=2): if np.sum( tscript_vec[(bounds[j - 1]):(bounds[j] + 1)]) > 0: drop_read = True # append read id to set of read indices to drop (if appropriate). if drop_read: drop_reads.append(dat[ii, 2]) # otherwise, append match region bounds list. Note: endpoints of regions are inclusive. else: read_bounds.append(bounds) # drop reads that don't fully intersect exonic regions. if drop_reads: reads_df = reads_df[~reads_df.read_id.isin(drop_reads)] if self.paired: # if paired reads, don't actually need .1 and .2 constituent reads anymore. # So to save time + memory, take every other read. reads_df = reads_df.iloc[np.arange(1, reads_df.shape[0], step=2)] # add parsed match region bounds to reads! reads_df['bounds'] = read_bounds # delete objs, attempt to save on memory. del tscript_vec, drop_reads, dat, read_bounds gc.collect() # ---------------------------------------------------------------------- # # Step 3. Compute coverage, reads across groups of mutually overlapping genes. # (This is costly from a time perspective. Should constitute # coverage, read count calculations for ~ 10-20% of genes.) # ---------------------------------------------------------------------- # # display summary statistics around rate of gene intersection. if self.verbose: logging.info( 'SAMPLE {0}, CHR {1} -- overlap genes = {2} / {3}.'.format( self.sample_id, chrom, n_overlap_genes, n_genes)) logging.info( 'SAMPLE {0}, CHR {1} -- begin overlap gene group reads processing.' .format(self.sample_id, chrom)) # for genes in a group of overlapping genes, compute read coverage + count. if n_overlap_genes > 0: ol_cov_dict = dict() # iterate over groups of overlapping genes. 
for ol_genes in gene_overlap_dat['overlap_genes']: ol_gene_df = chrom_gene_df[chrom_gene_df.gene.isin(ol_genes)] ol_gene_group_start = ol_gene_df.gene_start.min() - 1 ol_gene_group_end = ol_gene_df.gene_end.max() - 1 ol_gene_starts = list() gene_exon_bounds = list() transcript_idx = list( ) # list of 1-d np.arrays, each holding one overlapping gene's exon positioning. # obtain exon regions for each gene in overlap group. # Exon starts/ends are 1-indexed, change them to be 0-indexed. for ol_gene in ol_genes: ol_gene_exon_df = chrom_exon_df[chrom_exon_df.gene == ol_gene] # store gene starts for constructing per-gene coverage vectors. # 0-index gene starts/ends. ol_gene_start = ol_gene_exon_df.gene_start.iloc[0] - 1 ol_gene_end = ol_gene_exon_df.gene_end.iloc[0] - 1 ol_gene_starts.append(ol_gene_start) # initialize gene coverage vector for each gene in overlap group. ol_cov_dict[ol_gene] = np.zeros( [ol_gene_end - ol_gene_start + 1], dtype=int) # save gene exon positioning, for determining which reads captured by which genes. # 0-index exon positions, and include gene end positioning. e_starts, e_ends = np.sort( ol_gene_exon_df.start.values) - 1, np.sort( ol_gene_exon_df.end.values) gene_exon_bounds += [[ [e_starts[j], e_ends[j]] for j in range(len(e_starts)) ]] # list of list of lists, includes exon end pos. transcript_idx.append( np.unique( fill_in_bounds(flatten_2d(gene_exon_bounds[-1]))) ) # transcript vector is 0-indexed, includes exon end pos. # drop things we don't need any more. del ol_gene_df, ol_gene_exon_df, e_starts, e_ends # storage for reads to drop. drop_reads = list() # subset reads to those that start and end within scope of this bloc of overlapping genes. ol_reads_dat = reads_df[(reads_df.pos >= (ol_gene_group_start)) & (reads_df.end_pos <= (ol_gene_group_end))][[ 'bounds', 'read_id' ]].values # for single-read RNA-Seq experiments, we do not need such special consideration. for i in range(ol_reads_dat.shape[0]): # obtain read regions bounds. read_bounds, read_id = ol_reads_dat[i, :] # find genes that fully include this read. Everything is 0-indexed. caught_genes = self.determine_full_inclusion( read_bounds, gene_exon_bounds=gene_exon_bounds) # Ambiguous read determination logic: # - if paired reads lie fully within 0 or 2+ genes, do not use the reads pair and drop them. # - if read lies fully within a single gene: # - do not drop it. # - if the caught gene is the current gene being analyzed, use the read. O/w do not. n_caught_genes = len(caught_genes) # if only one gene captures read, use the read and identify capturing gene for # incrementing count, but drop it from consideration later (it's been accounted for). # if only full intersection is with with a single gene, increment coverage and read count # for that gene, and drop read. # Note: need to restart coverage calculations relative to gene's start position. if n_caught_genes == 1: drop_read = True read_gene = ol_genes[caught_genes[0]] read_gene_start = ol_gene_starts[caught_genes[0]] read_idx = fill_in_bounds( read_bounds, endpoint=True) - read_gene_start - 1 ol_cov_dict[read_gene][read_idx] += 1 read_count_dict[read_gene] += 1 # if no gene fully captures the read, do not use read *but do not drop it*, # for the possibility that some isolated gene captures the read later on. elif n_caught_genes == 0: drop_read = False # if > 1 gene fully captures the read, # do not use read and drop it from consideration. else: drop_read = True # if need be, add read to list of reads to be dropped. 
if drop_read: drop_reads.append(read_id) # drop ambiguous reads from larger set of chromosome reads, # should speed up gene-read searches in the future. if drop_reads: reads_df = reads_df[~reads_df.read_id.isin(drop_reads)] del drop_reads # pare down coverage vectors for genes in overlap group to their concatenated exon regions. for i in range(len(ol_genes)): ol_gene = ol_genes[i] ol_cov_dict[ol_gene] = ol_cov_dict[ol_gene][ transcript_idx[i] - ol_gene_starts[i]] # ---------------------------------------------------------------------- # # Step 3.5: save overlapping genes' coverage vectors. # overlapping gene coverage vector dict ->> pkl file. # ---------------------------------------------------------------------- # if self.verbose: logging.info( 'SAMPLE {0}, CHR {1} -- saving overlapping gene coverage vectors.' .format(self.sample_id, chrom)) # dump overlapping genes' coverage matrices. with open(ol_cov_file, 'wb') as f: pkl.dump(ol_cov_dict, f) # free up some memory -- delete groups of intersecting genes, etc. del ol_reads_dat, ol_cov_dict, transcript_idx, gene_exon_bounds gc.collect() if self.verbose: logging.info( 'SAMPLE {0}, CHR {1} -- overlapping gene reads processing successful.' .format(self.sample_id, chrom)) # ---------------------------------------------------------------------- # # Step 4. Compute coverage, reads for individual isolated genes. # ---------------------------------------------------------------------- # if n_isolated_genes > 0: if self.verbose: logging.info( 'SAMPLE {0}, CHR {1} -- begin isolated gene reads processing.' .format(self.sample_id, chrom)) # reduce chrom_gene_df to remaining genes chrom_gene_df = chrom_gene_df[chrom_gene_df.gene.isin( gene_overlap_dat['isolated_genes'])] # run same inclusion/exclusion transcript test but on the isolated genes. tscript_vec = np.ones([chrom_len], dtype=int) # identify regions of chromosome covered by isolated genes. # change gene starts/ends to 0-indexed to match 0-indexed tscript_vec array, but # gene ends are inclusive. gene_starts = chrom_gene_df.gene_start.values - 1 gene_ends = chrom_gene_df.gene_end.values for i in range(len(gene_starts)): tscript_vec[gene_starts[i]:gene_ends[i]] = 0 # identify reads that do not fall within an isolated gene's (start, end). drop_reads = list() dat = reads_df[['pos', 'end_pos', 'read_id']].values for i in range(dat.shape[0]): read_start, read_end, read_id = dat[i, :] # remember to include read end position. reads are 0-indexed. if np.sum(tscript_vec[read_start:(read_end + 1)]) > 0: drop_reads.append(read_id) # drop memory hogs. del dat, gene_starts, gene_ends, tscript_vec # drop reads that do not lie completely within area covered by isolated genes. if drop_reads: reads_df = reads_df[~reads_df.read_id.isin(drop_reads)] del drop_reads gc.collect() # (a precaution) only continue if we have any reads intersecting isolated genes. if not reads_df.empty: # initialize chromosome coverage array. cov_vec = np.zeros([chrom_len], dtype=int) # ---------------------------------------------------------------------- # # Step 4.5.1: join genes on reads data # so that each read is tied to a gene, for read counting purposes. # ---------------------------------------------------------------------- # # 0-index gene_starts, gene_ends because reads are 0-indexed. chrom_gene_df.loc[:, ['gene_start', 'gene_end']] -= 1 # add IntervalIndex index to chromosome gene data. 
chrom_gene_df.index = IntervalIndex.from_arrays( chrom_gene_df.gene_start, right=chrom_gene_df.gene_end, closed='both') try: reads_df['gene'] = chrom_gene_df.loc[ reads_df.pos].gene.values # if there remains at least one read that doesn't land within a gene span, # try another sweep to remove reads not within gene regions. except KeyError: # outline valid read start positions along transcript. tscript_vec = np.ones([chrom_len], dtype=int) for i in range(chrom_gene_df.shape[0]): left = chrom_gene_df.index[i].left right = chrom_gene_df.index[i].right + 1 tscript_vec[left:right] = 0 # iterate over reads, checking whether read start position falls within # a [gene_start, gene_end] region. drop_reads = list() for i in range(reads_df.shape[0]): if tscript_vec[reads_df.pos.iloc[i]] != 0: drop_reads.append(reads_df.read_id.iloc[i]) # drop reads that do not start within valid [gene_start, gene_end] regions. if drop_reads: reads_df = reads_df[~reads_df.read_id.isin(drop_reads)] del tscript_vec, drop_reads gc.collect() # subset reads to reads w/ valid read ID, then join with interval index again. reads_df['gene'] = chrom_gene_df.loc[ reads_df.pos].gene.values # loop over reads for isolated genes, incrementing read count and coverage. dat = reads_df[['bounds', 'gene']].values for i in range(dat.shape[0]): bounds, gene = dat[i, :] # reads are already 0-indexed. read_idx = fill_in_bounds(bounds, endpoint=True) # increment coverage and read count. cov_vec[read_idx] += 1 read_count_dict[gene] += 1 # ---------------------------------------------------------------------- # # Step 4.5.2: save chromosome coverage vector. # chromosome overage vector ->> compressed csr numpy array # ---------------------------------------------------------------------- # if self.verbose: logging.info( 'SAMPLE {0}, CHR {1} -- saving csr-compressed chrom coverage array.' .format(self.sample_id, chrom)) # save coverage vector as a compressed-sparse row matrix. sparse.save_npz(chrom_cov_file, matrix=sparse.csr_matrix(cov_vec)) # drop large data objects. del cov_vec, dat, reads_df # drop remaining large data data objects. del chrom_gene_df, chrom_exon_df gc.collect() if self.verbose: logging.info( 'SAMPLE {0}, CHR {1} -- isolated gene reads processing successful.' .format(self.sample_id, chrom)) # ---------------------------------------------------------------------- # # Step 5. Save read counts. # chromosome read counts ->> .csv file # ---------------------------------------------------------------------- # # construct read count DataFrame from read count dictionary. read_count_df = DataFrame({ 'gene': list(read_count_dict.keys()), self.sample_id: list(read_count_dict.values()) }) del read_count_dict gc.collect() if self.verbose: logging.info( 'SAMPLE {0}, CHR {1} -- mean per-gene read count: {2:.4}'. format(self.sample_id, chrom, read_count_df[self.sample_id].mean())) logging.info('SAMPLE {0}, CHR {1} -- saving read counts.'.format( self.sample_id, chrom)) # save sample's chromosome read counts to .csv for joining later. read_count_df.to_csv(count_file, index=False)
    help="Select the level at which labels in the same clusters are combined.",
)
parser.add_argument(
    "--bin-size",
    type=int,
    default=2,
    help="How many labels to group for each synthetic composite label.",
)
args = parser.parse_args()

data = args.data
bin_size = args.bin_size
combine_level = args.level

folder = f"./dataset/xmc-base/{data}/"
out_folder = f"./dataset-binned/{data}/"
C = cluster_chain(f"./model/{data}/ranker/**/C.npz", combine_level)
os.makedirs(out_folder, exist_ok=True)

ytr = sps.load_npz(folder + "Y.trn.npz")
yte = sps.load_npz(folder + "Y.tst.npz")
mapper, new_nr_labels = combine_from_cluster(C, bin_size)
new_ytr = combine_Y(mapper, new_nr_labels, ytr)
new_yte = combine_Y(mapper, new_nr_labels, yte)
invert_mapper = inversion_mapper(mapper)
with open(out_folder + "mapper.pkl", "wb") as writer:
    pkl.dump(invert_mapper, writer)
sps.save_npz(out_folder + "Y.trn.npz", new_ytr)
sps.save_npz(out_folder + "Y.tst.npz", new_yte)
def main(): global args args = Params() if args.use_alexnet: print("Using pre-trained alexnet") model = models.alexnet(pretrained=True) model.classifier[6] = nn.Linear(4096, args.num_classes) else: print("Using pre-trained inception_v3") # inception is changed to accept variable size inputs model = inception_v3(pretrained=True) model.fc = nn.Linear(2048, args.num_classes) model.aux_logits = False model = model.cuda() # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] args.best_prec1 = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) cudnn.benchmark = True # data loading code train_dataset = birdsnap_loader.BS(args.data_root, args.meta_data, split_name='train', im_size_crop=args.im_size_crop, im_size_resize=args.im_size_resize, is_train=True) val_dataset = birdsnap_loader.BS(args.data_root, args.meta_data, split_name=args.split_name, im_size_crop=args.im_size_crop, im_size_resize=args.im_size_resize, is_train=False) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True) if args.evaluate: prec1, preds, im_ids, feats = validate(val_loader, model, criterion, True, True) # write predictions to file if args.save_preds: # save dense #np.save(args.op_file_name, feats) # save sparse feats[feats < 0.000001] = 0.0 sp = sparse.csr_matrix(feats) sparse.save_npz(args.op_file_name + '_sparse', sp) # with open(args.op_file_name, 'w') as opfile: # opfile.write('id,predicted\n') # for ii in range(len(im_ids)): # opfile.write(str(im_ids[ii]) + ',' + ' '.join(str(x) for x in preds[ii,:])+'\n') return for epoch in range(args.start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch) # train for one epoch train(train_loader, model, criterion, optimizer, epoch) # evaluate on validation set prec1 = validate(val_loader, model, criterion, False) is_better = prec1 > args.best_prec1 # remember best Acc@1 and save checkpoint args.best_prec1 = max(prec1, args.best_prec1) model_state = { 'epoch': epoch + 1, #'arch': args.arch, 'state_dict': model.state_dict(), 'best_prec1': args.best_prec1, 'optimizer': optimizer.state_dict() } torch.save(model_state, args.model_path + 'checkpoint.pth.tar') if is_better: print('\t* Saving new best model') torch.save(model_state, args.model_path + 'model_best.pth.tar')
def prepare_kddcup10(data_name, min_interactions_per_user, kc_col_name,
                     remove_nan_skills, drop_duplicates=True):
    """Preprocess KDD Cup 2010 datasets.

    Arguments:
        data_name -- "bridge_algebra06" or "algebra05"
        min_interactions_per_user -- minimum number of interactions per student
        kc_col_name -- Skills id column
        remove_nan_skills -- if True, remove interactions with no skill tag
        drop_duplicates -- if True, drop duplicates from dataset

    Outputs:
        df -- preprocessed KDD Cup 2010 dataset (pandas DataFrame)
        Q_mat -- corresponding q-matrix (item-skill relationships sparse array)
    """
    folder_path = os.path.join("data", data_name)
    df = pd.read_csv(folder_path + "/data.txt", delimiter='\t').rename(columns={
        'Anon Student Id': 'user_id',
        'Problem Name': 'pb_id',
        'Step Name': 'step_id',
        kc_col_name: 'kc_id',
        'First Transaction Time': 'timestamp',
        'Correct First Attempt': 'correct'
    })[['user_id', 'pb_id', 'step_id', 'correct', 'timestamp', 'kc_id']]
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    df["timestamp"] = df["timestamp"] - df["timestamp"].min()
    #df["timestamp"] = df["timestamp"].apply(lambda x: x.total_seconds() / (3600*24))
    df["timestamp"] = df["timestamp"].apply(lambda x: x.total_seconds()).astype(np.int64)
    df.sort_values(by="timestamp", inplace=True)
    df.reset_index(inplace=True, drop=True)
    df = df.groupby("user_id").filter(lambda x: len(x) >= min_interactions_per_user)

    # Create variables
    df["item_id"] = df["pb_id"] + ":" + df["step_id"]
    df = df[['user_id', 'item_id', 'kc_id', 'correct', 'timestamp']]

    if drop_duplicates:
        df.drop_duplicates(subset=["user_id", "item_id", "timestamp"], inplace=True)

    if remove_nan_skills:
        df = df[~df["kc_id"].isnull()]
    else:
        # .ix was removed from pandas; .loc performs the same label-based assignment.
        df.loc[df["kc_id"].isnull(), "kc_id"] = 'NaN'

    # Create list of KCs
    listOfKC = []
    for kc_raw in df["kc_id"].unique():
        for elt in kc_raw.split('~~'):
            listOfKC.append(elt)
    listOfKC = np.unique(listOfKC)

    dict1_kc = {}
    dict2_kc = {}
    for k, v in enumerate(listOfKC):
        dict1_kc[v] = k
        dict2_kc[k] = v

    # Transform ids into numeric
    df["item_id"] = np.unique(df["item_id"], return_inverse=True)[1]
    df["user_id"] = np.unique(df["user_id"], return_inverse=True)[1]
    df.reset_index(inplace=True, drop=True)

    # Add unique identifier of the row
    df["inter_id"] = df.index

    # Build Q-matrix
    Q_mat = np.zeros((len(df["item_id"].unique()), len(listOfKC)))
    item_skill = np.array(df[["item_id", "kc_id"]])
    for i in range(len(item_skill)):
        splitted_kc = item_skill[i, 1].split('~~')
        for kc in splitted_kc:
            Q_mat[item_skill[i, 0], dict1_kc[kc]] = 1

    df = df[['user_id', 'item_id', 'timestamp', 'correct', 'inter_id']]
    df = df[df.correct.isin([0, 1])]  # Remove potential continuous outcomes
    df['correct'] = df['correct'].astype(np.int32)  # Cast outcome as int32

    # Save data
    sparse.save_npz(folder_path + "/q_mat.npz", sparse.csr_matrix(Q_mat))
    df.to_csv(folder_path + "/preprocessed_data.csv", sep="\t", index=False)

    return df, Q_mat
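# Hedged sketch of reading the two artifacts written above back for model fitting;
# the paths mirror those used in prepare_kddcup10 and the loader name is illustrative.
import os
import pandas as pd
from scipy import sparse

def load_kddcup10(data_name):
    folder_path = os.path.join("data", data_name)
    Q_mat = sparse.load_npz(os.path.join(folder_path, "q_mat.npz"))
    df = pd.read_csv(os.path.join(folder_path, "preprocessed_data.csv"), sep="\t")
    return df, Q_mat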
        temp = gmean(scipy.sparse.vstack(temp).todense() + 1e-15, axis=0)
        temp[temp < 1e-6] = 0
    return md5, csr_matrix(temp)


gc.collect()

print('[{}] Loading predictions...'.format(str(datetime.datetime.now())))
ms = [scipy.sparse.load_npz(str(x).replace('csv', 'npz')) for x in tqdm(tta_file_names)]

result = Parallel(n_jobs=12)(delayed(get_probs)(md5) for md5 in common_md5s)
# result = [get_probs(i) for i in tqdm(file_names.index)]

print('[{}] Unzipping...'.format(str(datetime.datetime.now())))
pred_md5_list, probs = zip(*result)
probs = vstack(probs)

labels = pd.DataFrame({'md5': pred_md5_list})

print('[{}] Saving labels...'.format(str(datetime.datetime.now())))
labels.to_csv(str(model_path / (args.average_type + '_{model_type}_md5_list.csv'.format(model_type=args.model_type))),
              index=False)

print('[{}] Saving predictions...'.format(str(datetime.datetime.now())))
save_npz(str(model_path / (args.average_type + '_{model_type}_probs.npz'.format(model_type=args.model_type))),
         probs)
    elif average_type == 'gmean':
        temp = gmean(scipy.sparse.vstack(temp).todense() + 1e-15, axis=0)
        temp[temp < 1e-6] = 0
    return file_to_md5[image_name], csr_matrix(temp)


result_path = Path('data') / 'prediction' / 'global'
result_path.mkdir(exist_ok=True, parents=True)

result = Parallel(n_jobs=12)(delayed(get_probs)(i) for i in file_names.index)
# result = [get_probs(i) for i in tqdm(file_names.index)]

print('[{}] Unzipping...'.format(str(datetime.datetime.now())))
pred_md5_list, probs = zip(*result)
probs = vstack(probs)

labels = pd.DataFrame({'md5': pred_md5_list})

print('[{}] Saving labels...'.format(str(datetime.datetime.now())))
labels.to_csv(str(result_path / (average_type + '_last_md5_list.csv')), index=False)

print('[{}] Saving predictions...'.format(str(datetime.datetime.now())))
save_npz(str(result_path / (average_type + '_last_probs.npz')), probs)
                                     '{}naive'.format(args.partition))
    try:
        os.mkdir(partition_dataset)
    except FileExistsError:
        pass
    chunk_size = int(len(train_nid) / args.partition)
    for pid in range(args.partition):
        start_ofst = chunk_size * pid
        if pid == args.partition - 1:
            end_ofst = len(train_nid)
        else:
            end_ofst = start_ofst + chunk_size
        part_nid = train_nid[start_ofst:end_ofst]
        subadj, sub2fullid, subtrainid = get_sub_graph(dgl_g, part_nid, args.num_hops)
        sublabel = labels[sub2fullid[subtrainid]]
        # files
        subadj_file = os.path.join(partition_dataset,
                                   'subadj_{}.npz'.format(str(pid)))
        sub_trainid_file = os.path.join(partition_dataset,
                                        'sub_trainid_{}.npy'.format(str(pid)))
        sub_train2full_file = os.path.join(partition_dataset,
                                           'sub_train2fullid_{}.npy'.format(str(pid)))
        sub_label_file = os.path.join(partition_dataset,
                                      'sub_label_{}.npy'.format(str(pid)))
        spsp.save_npz(subadj_file, subadj)
        np.save(sub_trainid_file, subtrainid)
        np.save(sub_train2full_file, sub2fullid)
        np.save(sub_label_file, sublabel)
def _save_and_load(matrix):
    with tempfile.NamedTemporaryFile(suffix='.npz') as file:
        file = file.name
        save_npz(file, matrix)
        loaded_matrix = load_npz(file)
    return loaded_matrix
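# If touching the filesystem is undesirable (re-opening a NamedTemporaryFile by
# name while it is still open can also fail on Windows), the same round trip works
# entirely in memory. Hedged alternative sketch:
from io import BytesIO
from scipy.sparse import save_npz, load_npz

def _save_and_load_in_memory(matrix):
    buf = BytesIO()
    save_npz(buf, matrix)
    buf.seek(0)
    return load_npz(buf)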
            # keep count
            k += 1
            if k % 10000 == 0:
                print("%s/%s" % (k, num_tokens))
            start = max(0, i - context_size)
            end = min(len(line_as_idx), i + context_size)
            for c in line_as_idx[start:i]:
                wc_counts[w, c] += 1
            for c in line_as_idx[i + 1:end]:
                wc_counts[w, c] += 1
    time_cost = round((time() - t0) / 60, 2)
    print(f"Finished counting, time cost: {time_cost} mins")
    save_npz(pmi_path, csr_matrix(wc_counts))
else:
    wc_counts = load_npz(pmi_path)

# context counts get raised ^ 0.75
c_counts = wc_counts.sum(axis=0).A.flatten() ** 0.75
c_probs = c_counts / c_counts.sum()
c_probs = c_probs.reshape(1, V)

# PMI(w, c) = #(w, c) / #(w) / p(c)
# pmi = wc_counts / wc_counts.sum(axis=1) / c_probs  # works only if numpy arrays
pmi = wc_counts.multiply(1.0 / wc_counts.sum(axis=1) / c_probs).tocsr()
# this operation changes it to a coo_matrix
# which doesn't have functions we need, e.g log1p()
# so convert it back to a csr
count = 0
def update_train(row):
    global count
    count += 1
    if count % 100000 == 0:
        print("processed: %.3f" % (float(count) / cutoff))
    i = int(row.userId)
    j = int(row.movie_idx)
    A[i, j] = row.rating

df_train.apply(update_train, axis=1)

# mask, to tell us which entries exist and which do not
A = A.tocsr()
mask = (A > 0)
save_npz("Atrain.npz", A)

# test ratings dictionary
A_test = lil_matrix((N, M))
print("Calling: update_test")
count = 0
def update_test(row):
    global count
    count += 1
    if count % 100000 == 0:
        print("processed: %.3f" % (float(count) / len(df_test)))
    i = int(row.userId)
    j = int(row.movie_idx)
    A_test[i, j] = row.rating

df_test.apply(update_test, axis=1)
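# The per-row lil_matrix updates above are simple but slow on large ratings tables.
# A hedged alternative (same userId / movie_idx / rating columns assumed) builds the
# whole matrix at once from the column arrays with a COO constructor.
from scipy.sparse import coo_matrix, save_npz

def ratings_to_csr(df, n_users, n_movies):
    return coo_matrix(
        (df.rating.values,
         (df.userId.astype(int).values, df.movie_idx.astype(int).values)),
        shape=(n_users, n_movies)).tocsr()

# e.g. save_npz("Atrain.npz", ratings_to_csr(df_train, N, M))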
#---------------------------------------------------------------------------
#
if __name__ == '__main__':
    data_file, until_idx = grabArguments()

    print('Loading Data...')
    data_filename, data_file_ext = path.splitext(data_file)
    if data_file_ext == '.npz':
        data = sparse.load_npz(data_file)
        is_sparse = True
    else:
        data = np.load(data_file, allow_pickle=True)
        is_sparse = False
    print(f'input shape: {data.shape}')

    print('Truncating Data...')
    if is_sparse:
        data = data.tocsr()[:until_idx]
    else:
        data = data[:until_idx]
    print(f'output shape: {data.shape}')

    print('Saving Data...')
    output_file = data_filename + '_truncated'
    if is_sparse:
        output_file += data_file_ext
        sparse.save_npz(output_file, data)
    else:
        np.save(output_file, data)
def data_final():
    data = pd.read_csv(data_path + "mergedDataFillna.csv")
    #print(data)
    # One quirk in the data: some rows carry two (or more) marital statuses at once.
    # Such rows are rare, so this should not matter much.
    one_hot_feature = [
        'LBS', 'age', 'carrier', 'consumptionAbility', 'education', 'gender',
        'house', 'os', 'ct', 'marriageStatus', 'advertiserId', 'campaignId',
        'creativeId', 'adCategoryId', 'productId', 'productType'
    ]
    # These features hold space-separated id lists like "598 872 2602 2964 1189 631 5606 ..."
    # (e.g. interest tags, where one row can carry a long list), so they need separate handling.
    vector_feature = [
        'appIdAction', 'appIdInstall', 'interest1', 'interest2', 'interest3',
        'interest4', 'interest5', 'kw1', 'kw2', 'kw3', 'topic1', 'topic2',
        'topic3'
    ]
    # Map each free-form categorical feature onto integer codes 0, 1, 2, ...
    # (the multi-valued marriage status is fine: at most five or six combinations exist).
    for feature in one_hot_feature:
        try:
            data[feature] = LabelEncoder().fit_transform(data[feature].apply(int))
        except:
            data[feature] = LabelEncoder().fit_transform(data[feature])

    train = data[data.label != -1]
    train_y = train.pop('label')  # pop removes the label column from train and returns it
    test = data[data.label == -1]
    #print(test)
    res = test[['aid', 'uid']]
    test = test.drop('label', axis=1)
    # train_x=train[one_hot_feature]
    # test_x=test[one_hot_feature]
    # train_x["creativeSize"]=train["creativeSize"]
    # test_x["creativeSize"] = test["creativeSize"]

    # creativeSize was not transformed above, so pull it out separately as the base for
    # later concatenation; [[...]] keeps a DataFrame, [...] would give a Series.
    train_x = train[['creativeSize']]
    test_x = test[['creativeSize']]

    # LightGBM does not strictly need one-hot encoding, but with this sparse storage
    # there is no easy way to address raw categorical columns, so one-hot them anyway.
    enc = OneHotEncoder()
    for feature in one_hot_feature:
        enc.fit(data[feature].values.reshape(-1, 1))
        train_a = enc.transform(train[feature].values.reshape(-1, 1))
        test_a = enc.transform(test[feature].values.reshape(-1, 1))
        train_x = sparse.hstack((train_x, train_a))  # sparse concatenation
        test_x = sparse.hstack((test_x, test_a))
    print("one-hot prepared!")

    # CountVectorizer features: CountVectorizer acts like a multi-valued one-hot encoder.
    cv = CountVectorizer()
    for feature in vector_feature:
        #print(data[feature])
        #print(data[feature].dtypes)
        # Fit on train and test together so that categories appearing only in the test
        # set do not break the transform (fitting them separately leaves mismatched columns).
        cv.fit(data[feature])
        train_a = cv.transform(train[feature])
        #print(train_a)  # this output shows CountVectorizer already returns a sparse matrix
        test_a = cv.transform(test[feature])
        train_x = sparse.hstack((train_x, train_a))
        test_x = sparse.hstack((test_x, test_a))
    # Stored as sparse matrices, these can be fed straight to LightGBM.
    print("cv prepared!")
    #print(train_x)
    #print(train_y)
    #print(test_x)
    sparse.save_npz(data_path + "train_x.npz", train_x)
    sparse.save_npz(data_path + "test_x.npz", test_x)
    train_y.to_csv(data_path + "train_y.csv", index=False)
    res.to_csv(data_path + "res.csv", index=False)
    Prepare sparse features
    '''
    X = {}
    X['users'] = onehotize(df['user'], config['nb_users'])
    X['items'] = onehotize(df['item'], config['nb_items'])
    if 'skill' in df:
        X['skills'] = onehotize(df['skill'], config['nb_skills'])
        X['wins'] = X['skills'].copy()
        X['wins'].data = df['wins']
        X['fails'] = X['skills'].copy()
        X['fails'].data = df['fails']
    X_train = hstack([X[agent] for agent in active_features]).tocsr()
    y_train = df['correct'].values
    return X_train, y_train


df = pd.read_csv('data.csv')
with open('config.yml') as f:
    config = yaml.safe_load(f)
    print('Configuration', config)
X, y = df_to_sparse(df, config, active_features)
print(df.head())
if options.dataset == 'dummy':
    print(X.todense())

save_npz('X-{:s}.npz'.format(features_suffix), X)
np.save('y-{:s}.npy'.format(features_suffix), y)
print('Successfully created X-{:s}.npz and y-{:s}.npy in data/{} folder'.format(
    features_suffix, features_suffix, options.dataset))
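# Hedged sketch of the matching load step for the two files written above
# (assumes the same features_suffix is in scope).
import numpy as np
from scipy.sparse import load_npz

X_loaded = load_npz('X-{:s}.npz'.format(features_suffix))
y_loaded = np.load('y-{:s}.npy'.format(features_suffix))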
def create_affinity(X, knn, scale=None, alg="annoy", savepath=None, W_path=None):
    N, D = X.shape
    if W_path is not None:
        if W_path.endswith('.mat'):
            W = sio.loadmat(W_path)['W']
        elif W_path.endswith('.npz'):
            W = sparse.load_npz(W_path)
    else:
        print('Compute Affinity ')
        start_time = timeit.default_timer()
        if alg == "flann":
            print('with Flann')
            flann = FLANN()
            knnind, dist = flann.nn(X, X, knn, algorithm="kdtree",
                                    target_precision=0.9, cores=5)
            # knnind = knnind[:,1:]
        # elif alg == "annoy":
        #     print('with annoy')
        #     ann = AnnoyIndex(D, metric='euclidean')
        #     for i, x_ in enumerate(X):
        #         ann.add_item(i, x_)
        #     ann.build(50)
        #     knnind = np.empty((N, knn))
        #     dist = np.empty((N, knn))
        #     for i in range(len(X)):
        #         nn_i = ann.get_nns_by_item(i, knn, include_distances=True)
        #         knnind[i, :] = np.array(nn_i[0])
        #         dist[i, :] = np.array(nn_i[1])
        else:
            nbrs = NearestNeighbors(n_neighbors=knn).fit(X)
            dist, knnind = nbrs.kneighbors(X)

        row = np.repeat(range(N), knn - 1)
        col = knnind[:, 1:].flatten()
        if scale is None:
            data = np.ones(X.shape[0] * (knn - 1))
        else:
            data = np.exp((-dist[:, 1:] ** 2) / (2 * scale ** 2)).flatten()

        # np.float was removed from recent numpy releases; the builtin float is equivalent here.
        W = sparse.csc_matrix((data, (row, col)), shape=(N, N), dtype=float)
        # W = (W + W.transpose(copy=True)) / 2
        elapsed = timeit.default_timer() - start_time
        print(elapsed)

        if isinstance(savepath, str):
            if savepath.endswith('.npz'):
                sparse.save_npz(savepath, W)
            elif savepath.endswith('.mat'):
                sio.savemat(savepath, {'W': W})

    return W
def main(): # get args args = TransformerMatcher.get_args_and_set_logger()["args"] # do_train and save model if args.do_train: # setup output_dir if os.path.exists(args.output_dir) and os.listdir( args.output_dir ) and args.do_train and not args.overwrite_output_dir: raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome." .format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # load data with open(args.trn_feat_path, "rb") as fin: X_trn = pickle.load(fin) C_trn = smat.load_npz(args.trn_label_path) # prepare transformer pretrained models TransformerMatcher.set_device(args) matcher = TransformerMatcher(num_clusters=C_trn.shape[1]) matcher.prepare_model(args) # train matcher.train(args, X_trn, C_trn) if args.local_rank in [-1, 0]: matcher.save_model(args) # do_eval on test set and save prediction output if args.do_eval: # we only support multigpu mode but not distributed mode assert args.local_rank == -1 # load data with open(args.trn_feat_path, "rb") as fin: X_trn = pickle.load(fin) with open(args.tst_feat_path, "rb") as fin: X_tst = pickle.load(fin) C_trn = smat.load_npz(args.trn_label_path) C_tst = smat.load_npz(args.tst_label_path) # load fine-tuned model in the args.output_dir TransformerMatcher.set_device(args) matcher = TransformerMatcher(num_clusters=C_trn.shape[1]) args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[ args.model_type] matcher.config = config_class.from_pretrained(args.output_dir) matcher.config.output_hidden_states = True model = model_class.from_pretrained(args.output_dir, config=matcher.config) model.to(args.device) matcher.model = model # predict trn_loss, trn_metrics, C_trn_pred, trn_embeddings = matcher.predict( args, X_trn, C_trn, topk=args.only_topk, get_hidden=True) tst_loss, tst_metrics, C_tst_pred, tst_embeddings = matcher.predict( args, X_tst, C_tst, topk=args.only_topk, get_hidden=True) logger.info("| matcher_trn_prec {}".format(" ".join( "{:4.2f}".format(100 * v) for v in trn_metrics.prec))) logger.info("| matcher_trn_recl {}".format(" ".join( "{:4.2f}".format(100 * v) for v in trn_metrics.recall))) logger.info("| matcher_tst_prec {}".format(" ".join( "{:4.2f}".format(100 * v) for v in tst_metrics.prec))) logger.info("| matcher_tst_recl {}".format(" ".join( "{:4.2f}".format(100 * v) for v in tst_metrics.recall))) # save C_trn_pred.npz and trn_embedding.npy trn_csr_codes = rf_util.smat_util.sorted_csr(C_trn_pred, only_topk=args.only_topk) trn_csr_codes = transform_prediction(trn_csr_codes, transform="lpsvm-l2") csr_codes_path = os.path.join(args.output_dir, "C_trn_pred.npz") smat.save_npz(csr_codes_path, trn_csr_codes) embedding_path = os.path.join(args.output_dir, "trn_embeddings.npy") np.save(embedding_path, trn_embeddings) # save C_eval_pred.npz and tst_embedding.npy tst_csr_codes = rf_util.smat_util.sorted_csr(C_tst_pred, only_topk=args.only_topk) tst_csr_codes = transform_prediction(tst_csr_codes, transform="lpsvm-l2") csr_codes_path = os.path.join(args.output_dir, "C_tst_pred.npz") smat.save_npz(csr_codes_path, tst_csr_codes) embedding_path = os.path.join(args.output_dir, "tst_embeddings.npy") np.save(embedding_path, tst_embeddings)
def save_term_by_document_matrix(matrix):
    filepath = os.path.join(os.getcwd(), "results", "term_by_document_matrix.npz")
    sparse.save_npz(filepath, matrix)
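# Hedged loading counterpart for the helper above, reading the same path back.
import os
from scipy import sparse

def load_term_by_document_matrix():
    filepath = os.path.join(os.getcwd(), "results", "term_by_document_matrix.npz")
    return sparse.load_npz(filepath)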
def h_construction(outfile, WithRTPrediction_, WithCosineH_, initRT_tuple, initRT_width, V_mat, W_mat, noise_number, globalparam_list, iso_maxnumber, gaussian_width, cos_cutoff, output_file): gc.disable() V_shape, W_shape = V_mat.shape, W_mat.shape Hpeak_mean = 1 * W_shape[0] / W_shape[1] # proofed per row if WithRTPrediction_: if W_shape[1] - noise_number != len(initRT_tuple): print('Warning: W_shape is wrong') exit() H_mat, initRT_correct = h_prediction(initRT_tuple, initRT_width, Hpeak_mean, noise_number, globalparam_list, gaussian_width) # H_mat = H_mat.todense() else: H_mat = h_constant(V_shape, W_shape, Hpeak_mean, noise_number) initRT_correct = None if WithCosineH_: H_size_rowcut = 250 V_mat = V_mat.tocsr() W_mat = W_mat.tocsc() H_mat = H_mat.tocsr() if H_mat.shape[0] > H_size_rowcut: H_mat_list, H_cosmat_list = [], [] for row in np.arange(0, H_mat.shape[0], H_size_rowcut): if row + H_size_rowcut >= H_mat.shape[0]: row_end = H_mat.shape[0] else: row_end = row + H_size_rowcut H_mat_out, H_cosmatdub_out = h_cos(V_mat, W_mat[:, row:row_end], H_mat[row:row_end, :], iso_maxnumber, cos_cutoff) H_mat_list.append(H_mat_out) del (H_mat_out) H_cosmat_list.append(H_cosmatdub_out) del (H_cosmatdub_out) H_cosmat = sparse.vstack(H_cosmat_list) H_cosmat = H_cosmat.tocoo() del (H_cosmat_list) H_mat = sparse.vstack(H_mat_list) del (H_mat_list) else: H_mat, H_cosmat = h_cos(V_mat, W_mat, H_mat, iso_maxnumber, cos_cutoff) else: H_cosmat = None if noise_number > 0: noise_mat = np.zeros((noise_number, H_mat.shape[1])) noise_mat += Hpeak_mean / H_mat.shape[1] try: H_mat = sparse.vstack([H_mat, noise_mat]) H_mat = H_mat.tocoo() except Exception: H_mat = np.vstack([H_mat, noise_mat]) # change to sparse if better for memory if np.count_nonzero(H_mat) * 3 < H_mat.size: H_mat = sparse.coo_matrix(H_mat) sparse.save_npz(str(output_file) + '_Hmat_init.npz', H_mat) return H_mat, initRT_correct, H_cosmat
def weight_SWSN(ann_matrix, sparse_nets=None, normalized_nets=None, net_names=None, out_file=None, nodes=None, verbose=False): """ *normalized_nets*: list of networks stored as scipy sparse matrices. Should already be normalized """ # UPDATED: normalize the networks if sparse_nets is not None: print("Normalizing the networks") normalized_nets = [] for net in sparse_nets: normalized_nets.append(_net_normalize(net)) elif normalized_nets is None: print("No networks given. Nothing to do") return None, 0 if len(normalized_nets) == 1: print("Only one network given to weight_SWSN. Nothing to do.") total_time = 0 return sparse_nets[0], total_time if verbose: print("Removing rows with 0 annotations/positives") utils.print_memory_usage() # remove rows with 0 annotations/positives empty_rows = [] for i in range(ann_matrix.shape[0]): pos, neg = alg_utils.get_term_pos_neg(ann_matrix, i) # the combineWeightsSWSN method doesn't seem to # work if there's only 1 positive if len(pos) <= 1 or len(neg) <= 1: empty_rows.append(i) # don't modify the original annotation matrix to keep the rows matching the GO ids curr_ann_mat = delete_rows_csr(ann_matrix.tocsr(), empty_rows) if verbose: utils.print_memory_usage() print("Weighting networks for %d different terms" % (curr_ann_mat.shape[0])) print("Running simultaneous weights with specific negatives") start_time = time.process_time() alpha, indices = combineNetworksSWSN(curr_ann_mat, normalized_nets, verbose=verbose) # print out the computed weights for each network if net_names is not None: print("network weights:") #print("\tnetworks chosen: %s" % (', '.join([net_names[i] for i in indices]))) weights = defaultdict(int) for i in range(len(alpha)): weights[net_names[indices[i]]] = alpha[i] weights_table = ["%0.3e" % weights[net] for net in net_names] print('\t'.join(net_names)) print('\t'.join(weights_table)) # now add the networks together with the alpha weight applied weights_list = [0] * len(normalized_nets) weights_list[indices[0]] = alpha[0] combined_network = alpha[0] * normalized_nets[indices[0]] for i in range(1, len(alpha)): combined_network += alpha[i] * normalized_nets[indices[i]] weights_list[indices[i]] = alpha[i] total_time = time.process_time() - start_time if out_file is not None: # replace the .txt if present out_file = out_file.replace('.txt', '.npz') utils.checkDir(os.path.dirname(out_file)) print("\twriting combined network to %s" % (out_file)) sp.save_npz(out_file, combined_network) # also write the node ids so it's easier to access # TODO figure out a better way to store this node2idx_file = out_file + "-node-ids.txt" print("\twriting node ids to %s" % (node2idx_file)) with open(node2idx_file, 'w') as out: out.write(''.join("%s\t%s\n" % (n, i) for i, n in enumerate(nodes))) # write the alpha/weight of the networks as well net_weight_file = out_file + "-net-weights.txt" print("\twriting network weights to %s" % (net_weight_file)) with open(net_weight_file, 'w') as out: out.write(''.join("%s\t%s\n" % (net_names[idx], str(alpha[i])) for i, idx in enumerate(indices))) return combined_network, total_time, weights_list
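# Hedged sketch (assumed downstream reader, not from the source): reload the combined
# network written by weight_SWSN together with its "-node-ids.txt" sidecar file, whose
# format mirrors the writes above. 'combined.npz' is an illustrative file name.
import scipy.sparse as sp

def load_combined_network(out_file="combined.npz"):
    W = sp.load_npz(out_file)
    node2idx = {}
    with open(out_file + "-node-ids.txt") as f:
        for line in f:
            node, idx = line.rstrip("\n").split("\t")
            node2idx[node] = int(idx)
    return W, node2idx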
def preprocess(pvalue_thr=1e-200, cancer_type='BRCA'): ############################################################## ####### Select pseudogene and coding genes ############################################################## print("Select pseudogene and coding genes") gencode = read_gtf("../data/raw_data/gencode.v29.annotation.gtf") gencode = gencode[gencode['feature'] == 'gene'] # select pseudogenes pseudogene = gencode[gencode['gene_type'].isin([ "transcribed_unprocessed_pseudogene", "transcribed_processed_pseudogene", "translated_processed_pseudogene" ])] pseudogene = pseudogene.drop([ "score", "strand", "frame", "level", "tag", "exon_number", "exon_id", "ont", "protein_id", "ccdsid", "transcript_support_level", "havana_transcript", "havana_gene", "source", "transcript_id", "transcript_name", "transcript_type" ], axis=1) pseudogene.drop_duplicates(subset=['gene_name'], keep='first', inplace=True) print("Pseudogene number: ", pseudogene.shape[0]) # select coding genes coding = gencode[gencode['gene_type'] == 'protein_coding'] coding = coding.drop([ "score", "strand", "frame", "level", "tag", "exon_number", "exon_id", "ont", "protein_id", "ccdsid", "transcript_support_level", "havana_transcript", "havana_gene", "source", "transcript_id", "transcript_name", "transcript_type" ], axis=1) coding.drop_duplicates(subset=['gene_name'], keep='first', inplace=True) print("coding gene number: ", coding.shape[0]) ############################################################## ####### generate genome sequence ############################################################## print("generate genome sequence") with open("../data/raw_data/GRCh38.primary_assembly.genome.fa") as f: data = f.readlines() chr_seq_map = dict() i = 0 while i < len(data): if data[i][0] == ">": key = data[i].split(" ")[0] j = 1 temp = [] while (i + j) < len(data): if data[i + j][0] != ">": temp.append(data[i + j][:-1]) j = j + 1 else: break value = "".join(temp) chr_seq_map[key] = value i = i + j chr_seq_mapping = dict() for key, value in chr_seq_map.items(): if key[:4] == '>chr': chr_seq_mapping[key[1:]] = value def func(x): temp = chr_seq_mapping[x['seqname']] return temp[x['start']:x['end']] pseudogene['sequence'] = pseudogene.apply(func, axis=1) coding['sequence'] = coding.apply(func, axis=1) all_genes = pseudogene.append(coding, ignore_index=True, sort=False) all_genes.drop_duplicates(subset=['gene_name'], keep='first', inplace=True) ################################################################## ######## choose final pseudogene and coding candidates pseudo_list = all_genes[ all_genes['gene_type'] != 'protein_coding']['gene_name'].values coding_list = all_genes[all_genes['gene_type'] == 'protein_coding']['gene_name'].values # build similarity network print("filtering blast results") similarity_res = pd.read_csv("../data/raw_data/blast_similarity.csv", names=['query', 'target', 'evalue']) similarity_res = similarity_res[similarity_res['evalue'] < pvalue_thr] # delete self-self pairs similarity_res = similarity_res[(similarity_res['query'] != similarity_res['target'])] # Only select pseudogene as the query similarity_select = similarity_res[similarity_res['query'].isin( pseudo_list)] # select corresonding coding genes similarity_candidate = set( similarity_select['target'].unique()) | set(pseudo_list) # filter by the candidates similarity_final = similarity_res[similarity_res['query'].isin( similarity_candidate)] similarity_final = similarity_final[similarity_final['target'].isin( similarity_final['query'].unique())] 
final_similarity_candidate = np.array( list( set(similarity_final['query'].unique()) | set(similarity_final['target'].unique()))) # this is all the genes we will use in our model final_all_genes = all_genes[all_genes['gene_name'].isin( final_similarity_candidate)] final_all_genes.index = range(len(final_all_genes)) all_genes_mapping = dict( zip(final_all_genes['gene_name'], range(len(final_all_genes)))) print("Dataset size: ", final_all_genes.shape[0]) ################################################################## ###################### Build Networks ############################ print("build similarity adj matrix") similarity_final['query_id'] = similarity_final['query'].apply( lambda x: all_genes_mapping[x]) similarity_final['target_id'] = similarity_final['target'].apply( lambda x: all_genes_mapping[x]) adj_simi = np.zeros((len(all_genes_mapping), len(all_genes_mapping))) for i, row in tqdm(similarity_final.iterrows()): adj_simi[row['query_id']][row['target_id']] = 1 sAdj_simi = sparse.csr_matrix(adj_simi) sparse.save_npz("../data/final_input/adj_simi.npz", sAdj_simi) print("build TCGA co-expression network") df = pd.read_csv('../data/raw_data/TCGA_' + cancer_type + '.csv', names=[ "index", "id", "name", "geneSymbol", "MedianExpValueTumor", "MedianExpValueNormal", "log_aveExpValueTumor", "log_aveExpValueNormal", "expValuesTumor", "expValuesNormal", "log_expValuesTumor", "log_expValuesNormal", "paired" ], index_col=["index"], skipinitialspace=True) expression_new = df[['geneSymbol', 'log_expValuesTumor']] # only select genes that are included in the pseudogene and coding gene list all_genes_name = final_all_genes['gene_name'].values expression_selected = expression_new[expression_new['geneSymbol'].isin( all_genes_name)] expression_selected = expression_selected.reset_index(drop=True) # build expression networks expression_pairs = build_expression_network(expression_selected) final_co_expression = defaultdict(list) for key, value in tqdm(expression_pairs.items()): temp = [(all_genes_mapping[x[0]], x[1]) for x in value] final_co_expression[all_genes_mapping[key]] = temp adj_co_expression = np.zeros( (len(all_genes_mapping), len(all_genes_mapping))) for key, value in tqdm(final_co_expression.items()): for x in value: adj_co_expression[key][x[0]] = 1 sAdj_co = sparse.csr_matrix(adj_co_expression) sparse.save_npz("../data/final_input/adj_TCGA_" + cancer_type + ".npz", sAdj_co) print("generate node2vec embeddings for co-expression network") G_coexp = nx.from_scipy_sparse_matrix(sAdj_co) node2vec_coexp = Node2Vec(G_coexp, dimensions=256, walk_length=15, num_walks=150, workers=28) model_coexp = node2vec_coexp.fit(window=10, min_count=1, batch_words=4) model_coexp.wv.save_word2vec_format("../data/final_input/node2vec_TCGA_" + cancer_type + ".txt") print("build ppi and genetic interaction network") biogrid = pd.read_table("../data/raw_data/BIOGRID-ALL-3.5.173.tab2.txt") biogrid = biogrid[(biogrid['Organism Interactor A'] == 9606) & (biogrid['Organism Interactor B'] == 9606)] biogrid = biogrid[[ '#BioGRID Interaction ID', 'Entrez Gene Interactor A', 'Entrez Gene Interactor B', 'Official Symbol Interactor A', 'Official Symbol Interactor B', 'Experimental System', 'Experimental System Type' ]] biogrid = biogrid[(biogrid['Official Symbol Interactor A'].isin( final_all_genes['gene_name'].unique())) & (biogrid['Official Symbol Interactor B'].isin( final_all_genes['gene_name'].unique()))] biogrid['query_id'] = biogrid['Official Symbol Interactor A'].apply( lambda x: all_genes_mapping[x]) 
biogrid['target_id'] = biogrid['Official Symbol Interactor B'].apply( lambda x: all_genes_mapping[x]) adj_ppi = np.zeros((len(all_genes_mapping), len(all_genes_mapping))) for i, row in tqdm(biogrid.iterrows()): adj_ppi[row['query_id']][row['target_id']] = 1 sAdj_ppi = sparse.csr_matrix(adj_ppi) sparse.save_npz("../data/final_input/adj_ppi.npz", sAdj_ppi) print( "generate node2vec embeddings for PPI and genetic interaction network") G_ppi = nx.from_scipy_sparse_matrix(sAdj_ppi) node2vec_ppi = Node2Vec(G_ppi, dimensions=256, walk_length=15, num_walks=150, workers=28) model_ppi = node2vec_ppi.fit(window=10, min_count=1, batch_words=4) model_ppi.wv.save_word2vec_format("../data/final_input/node2vec_ppi.txt") ########################################################## ############ Generate feature dataframe ################## print("Get GO labels for both pseudogenes and coding genes") goa = pd.read_csv("../data/raw_data/goa_human.gaf", sep="\t", skiprows=31, names=[ 'DB', 'DB Object ID', 'DB Object Symbol', 'Qualifier', 'GO', 'reference', 'Evidence', 'with form', 'Aspect', 'DB Object Name', 'Synonym', 'type', 'Taxon', 'Date', 'Assigned by', 'extension', 'Gene product form ID' ]) goa = goa[goa['DB Object Symbol'].isin(final_all_genes['gene_name'])] goa_F = goa[goa['Aspect'] == 'F'] goa_P = goa[goa['Aspect'] == 'P'] goa_C = goa[goa['Aspect'] == 'C'] final_all_genes['MF'] = final_all_genes['gene_name'].apply( lambda x: list(goa_F[goa_F['DB Object Symbol'] == x]['GO'])) final_all_genes['BP'] = final_all_genes['gene_name'].apply( lambda x: list(goa_P[goa_P['DB Object Symbol'] == x]['GO'])) final_all_genes['CC'] = final_all_genes['gene_name'].apply( lambda x: list(goa_C[goa_C['DB Object Symbol'] == x]['GO'])) from go_anchestor import get_gene_ontology, get_anchestors go = get_gene_ontology() BIOLOGICAL_PROCESS = 'GO:0008150' MOLECULAR_FUNCTION = 'GO:0003674' CELLULAR_COMPONENT = 'GO:0005575' new_cc = [] new_mf = [] new_bp = [] for i, row in final_all_genes.iterrows(): labels = row['CC'] temp = set([]) for x in labels: temp = temp | get_anchestors(go, x) temp.discard(CELLULAR_COMPONENT) new_cc.append(list(temp)) labels = row['MF'] temp = set([]) for x in labels: temp = temp | get_anchestors(go, x) temp.discard(MOLECULAR_FUNCTION) new_mf.append(list(temp)) labels = row['BP'] temp = set([]) for x in labels: temp = temp | get_anchestors(go, x) temp.discard(BIOLOGICAL_PROCESS) new_bp.append(list(temp)) final_all_genes['cc'] = new_cc final_all_genes['mf'] = new_mf final_all_genes['bp'] = new_bp mf_items = [item for sublist in final_all_genes['mf'] for item in sublist] mf_unique_elements, mf_counts_elements = np.unique(mf_items, return_counts=True) bp_items = [item for sublist in final_all_genes['bp'] for item in sublist] bp_unique_elements, bp_counts_elements = np.unique(bp_items, return_counts=True) cc_items = [item for sublist in final_all_genes['cc'] for item in sublist] cc_unique_elements, cc_counts_elements = np.unique(cc_items, return_counts=True) mf_list = mf_unique_elements[np.where(mf_counts_elements > 25)] cc_list = cc_unique_elements[np.where(cc_counts_elements > 25)] bp_list = bp_unique_elements[np.where(bp_counts_elements > 250)] print("CC:", len(cc_list)) print("MF:", len(mf_list)) print("BP:", len(bp_list)) temp_mf = final_all_genes['mf'].apply( lambda x: list(set(x) & set(mf_list))) final_all_genes['temp_mf'] = temp_mf temp_cc = final_all_genes['cc'].apply( lambda x: list(set(x) & set(cc_list))) final_all_genes['temp_cc'] = temp_cc temp_bp = final_all_genes['bp'].apply( lambda x: 
list(set(x) & set(bp_list))) final_all_genes['temp_bp'] = temp_bp mf_dict = dict(zip(list(mf_list), range(len(mf_list)))) cc_dict = dict(zip(list(cc_list), range(len(cc_list)))) bp_dict = dict(zip(list(bp_list), range(len(bp_list)))) mf_encoding = [[0] * len(mf_dict) for i in range(len(final_all_genes))] cc_encoding = [[0] * len(cc_dict) for i in range(len(final_all_genes))] bp_encoding = [[0] * len(bp_dict) for i in range(len(final_all_genes))] for i, row in final_all_genes.iterrows(): for x in row['temp_mf']: mf_encoding[i][mf_dict[x]] = 1 for x in row['temp_cc']: cc_encoding[i][cc_dict[x]] = 1 for x in row['temp_bp']: bp_encoding[i][bp_dict[x]] = 1 final_all_genes['cc_label'] = cc_encoding final_all_genes['mf_label'] = mf_encoding final_all_genes['bp_label'] = bp_encoding final_all_genes.rename(columns={ "temp_mf": "filter_mf", "temp_bp": "filter_bp", "temp_cc": "filter_cc" }, inplace=True) final_all_genes.drop(columns=['MF', 'CC', 'BP', 'mf', 'cc', 'bp'], inplace=True) with open("../data/final_input/mf_list.txt", "w") as f: for x in list(mf_list): f.write(x + "\n") with open("../data/final_input/cc_list.txt", "w") as f: for x in list(cc_list): f.write(x + "\n") with open("../data/final_input/bp_list.txt", "w") as f: for x in list(bp_list): f.write(x + "\n") print("Add microRNA interactions as features") miRNA = pd.read_excel("../data/raw_data/miRNA.xlsx") miRNA = miRNA[miRNA['Target Gene'].isin( final_all_genes['gene_name'].unique())] selected_miRNA = miRNA['miRNA'].value_counts().index[( miRNA['miRNA'].value_counts() > 250)] miRNA = miRNA[miRNA['miRNA'].isin(selected_miRNA)] micro_mapping = dict(zip(list(selected_miRNA), range(len(selected_miRNA)))) micro_encoding = [] for i, row in tqdm(final_all_genes.iterrows()): cur_mir = miRNA[miRNA['Target Gene'] == row['gene_name']]['miRNA'] temp_encoding = [0] * len(selected_miRNA) for x in cur_mir: temp_encoding[micro_mapping[x]] = 1 micro_encoding.append(temp_encoding) final_all_genes['microRNA_250'] = micro_encoding with open("../data/final_input/microRNA_list.txt", "w") as f: for x in list(selected_miRNA): f.write(x + "\n") print("Add GTEx median expression profiles") GTEx = pd.read_csv( "../data/raw_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct", skiprows=2, sep='\t') final_all_genes['gene_id'] = final_all_genes['gene_id'].apply( lambda x: x.split(".")[0]) GTEx['Name'] = GTEx['Name'].apply(lambda x: x.split(".")[0]) GTEx_new = pd.DataFrame({ 'gene_id': GTEx['Name'], 'expression': GTEx.iloc[:, 2:].values.tolist() }) GTEx_new.drop_duplicates(subset=['gene_id'], keep='first', inplace=True) final_all_genes = pd.merge(final_all_genes, GTEx_new, on='gene_id', how='left') final_all_genes['expression'] = final_all_genes['expression'].apply( lambda d: d if isinstance(d, list) else [0.0] * 54) def reshape(features): return np.hstack(features).reshape((len(features), len(features[0]))) print("generate GTEx node2vec features...") expression = reshape(final_all_genes['expression'].values).T cor_matrix, pval = spearmanr(expression, nan_policy='omit') cor_matrix = np.nan_to_num(cor_matrix, 0) thr = 0.9 adj_coexp_GTEx = np.zeros( (final_all_genes.shape[0], final_all_genes.shape[0])) adj_coexp_GTEx[cor_matrix > thr] = 1 adj_coexp_GTEx = sparse.csr_matrix(adj_coexp_GTEx) sparse.save_npz("../data/final_input/adj_GTEx.npz", adj_coexp_GTEx) print("generate node2vec embeddings for GTEx co-expression network") G_coexp = nx.from_scipy_sparse_matrix(adj_coexp_GTEx) node2vec_coexp = Node2Vec(G_coexp, dimensions=256, walk_length=15, 
num_walks=150, workers=28) model_coexp = node2vec_coexp.fit(window=10, min_count=1, batch_words=4) model_coexp.wv.save_word2vec_format( "../data/final_input/node2vec_GTEx.txt") print("Saving feature dataframe") final_all_genes.to_pickle("../data/final_input/features_all.pkl") features_input = final_all_genes.loc[:, [ 'gene_id', 'gene_name', 'gene_type', 'cc_label', 'mf_label', 'bp_label', 'microRNA_250', 'expression' ]] features_input.to_pickle("../data/final_input/features.pkl")
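# Hedged sketch (assumed downstream loading code, not from the source): read back the sparse
# adjacency matrices, a node2vec embedding file (word2vec text format), and the feature
# dataframe that preprocess() writes under ../data/final_input/.
import pandas as pd
from scipy import sparse
from gensim.models import KeyedVectors

adj_simi = sparse.load_npz("../data/final_input/adj_simi.npz")
adj_ppi = sparse.load_npz("../data/final_input/adj_ppi.npz")
emb_ppi = KeyedVectors.load_word2vec_format("../data/final_input/node2vec_ppi.txt")
features = pd.read_pickle("../data/final_input/features.pkl")
print(adj_simi.shape, adj_ppi.shape, features.shape)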
def gen_linear_term(): from sklearn.preprocessing import OneHotEncoder context_features = ['sitesetID', 'positionType', 'connectionType', 'telecomsOperator', 'hour', 'hour_weight', 'is_pref_cat'] user_features = ['age', 'gender', 'education', 'marriageStatus', 'haveBaby', 'hometown', 'residence', 'user_activity', 'cat_pref'] ad_features = ['advertiserID', 'appPlatform', 'appCategory', 'app_popularity'] # load the dataset dataset = pd.read_hdf(path_intermediate_dataset + hdf_dataset) # y y = dataset['label'].values # save np.save(path_modeling_dataset + npy_y, y) # manually free memory del y gc.collect() # one-hot context enc_context = OneHotEncoder() context_csc = enc_context.fit_transform( dataset[context_features].values).tocsc() # save save_npz(path_modeling_dataset + npz_context, context_csc) # manually free memory del context_csc gc.collect() # one-hot user enc_user = OneHotEncoder() user_csc = enc_user.fit_transform(dataset[user_features].values).tocsc() # save save_npz(path_modeling_dataset + npz_user, user_csc) # manually free memory del user_csc gc.collect() # one-hot ad enc_ad = OneHotEncoder() ad_csc = enc_ad.fit_transform(dataset[ad_features].values).tocsc() # save save_npz(path_modeling_dataset + npz_ad, ad_csc) # manually free memory del ad_csc gc.collect() # release the dataset del dataset gc.collect() # load testset_ol testset_ol = pd.read_hdf(path_intermediate_dataset + hdf_testset_ol) # one-hot context context_csc_test_ol = enc_context.transform( testset_ol[context_features].values).tocsc() # save save_npz(path_modeling_dataset + npz_context_test_ol, context_csc_test_ol) # manually free memory del context_csc_test_ol gc.collect() # one-hot user user_csc_test_ol = enc_user.transform( testset_ol[user_features].values).tocsc() # save save_npz(path_modeling_dataset + npz_user_test_ol, user_csc_test_ol) # manually free memory del user_csc_test_ol gc.collect() # one-hot ad ad_csc_test_ol = enc_ad.transform(testset_ol[ad_features].values).tocsc() # save save_npz(path_modeling_dataset + npz_ad_test_ol, ad_csc_test_ol) # manually free memory del ad_csc_test_ol gc.collect() # release testset_ol del testset_ol gc.collect()
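# Hedged sketch (assumed follow-up step, not from the source): reload the one-hot blocks
# saved by gen_linear_term above and stack them into a single CSR design matrix for a linear
# model; path_modeling_dataset and the npz_*/npy_y names mirror the module-level variables
# already used in gen_linear_term.
import numpy as np
from scipy.sparse import hstack, load_npz

def load_linear_term():
    X = hstack([
        load_npz(path_modeling_dataset + npz_context),
        load_npz(path_modeling_dataset + npz_user),
        load_npz(path_modeling_dataset + npz_ad),
    ], format='csr')
    y = np.load(path_modeling_dataset + npy_y)
    return X, y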
rowAr = [0 for j in range(len(line))] a = csr_matrix((np.ones(len(line)), (rowAr, line)), shape=(1, numFeat)) if testMat is None: testMat = a else: testMat = vstack([testMat, a]) i += 1 if i % 100 == 0: print(i) a = clf.predict(testMat) wrong = 0 missed_bully = 0 for i in range(len(testLabels)): if a[i] - testLabels[i] != 0: # wrong prediction wrong += 1 if a[i] == 0 and testLabels[i] == 1: # missed a bullying incident missed_bully += 1 print('Fraction of wrong guesses on test set: ' + str(float(wrong) / len(testLabels))) print('Fraction of missed bullying on test set: ' + str(float(missed_bully) / len(testLabels))) # Final SVM update for simulator ultraFinal = vstack([finalMat, devMat, testMat]) finalLabels = trainLabels + devLabels + testLabels clf = svm.NuSVC(nu=0.05, probability=True) clf.fit(ultraFinal, finalLabels) joblib.dump(clf, 'model.pkl') save_npz('master_convo.npz', ultraFinal) # for updating during simulation with open('master_labels.txt', 'wb') as f: pickle.dump(finalLabels, f)
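# Hedged sketch (assumed loader, not from the source): restore the artifacts written above so
# the simulator can keep updating the SVM between sessions.
import pickle
import joblib
from scipy.sparse import load_npz

clf = joblib.load('model.pkl')
master_convo = load_npz('master_convo.npz')
with open('master_labels.txt', 'rb') as f:
    master_labels = pickle.load(f)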
def main(): """Create the model and start the evaluation process.""" args = get_arguments() num_steps = file_len(os.path.join(args.img_path, args.data_list)) # Create queue coordinator. coord = tf.train.Coordinator() # Load reader. with tf.name_scope("create_inputs"): reader = ImageReader( os.path.join(args.img_path, "texture"), os.path.join(args.img_path, args.data_list), None, # No defined input size. False, # No random scale. False, # No random mirror. 255, IMG_MEAN, coord, ) image, label = reader.image, reader.label title = reader.queue[0] image_batch, label_batch = ( tf.expand_dims(image, axis=0), tf.expand_dims(label, axis=0), ) # Add one batch dimension. # Create network. net = DeepLabResNetModel( {"data": image_batch}, is_training=False, num_classes=args.num_classes ) # Which variables to load. restore_var = tf.global_variables() # Predictions. raw_output = net.layers["fc1_voc12"] before_argmax = tf.image.resize_bilinear(raw_output, tf.shape(image_batch)[1:3,]) raw_output_up = tf.argmax(before_argmax, dimension=3) pred = tf.expand_dims(raw_output_up, axis=3) hw_only = pred[0, :, :, 0] class_0 = tf.where(tf.equal(hw_only, 0)) class_1 = tf.where(tf.equal(hw_only, 1)) class_2 = tf.where(tf.equal(hw_only, 2)) class_3 = tf.where(tf.equal(hw_only, 3)) class_4 = tf.where(tf.equal(hw_only, 4)) class_5 = tf.where(tf.equal(hw_only, 5)) class_6 = tf.where(tf.equal(hw_only, 6)) # Set up TF session and initialize variables. config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) init = tf.global_variables_initializer() sess.run(init) # Load weights. loader = tf.train.Saver(var_list=restore_var) load(loader, sess, args.model_weights) # Start queue threads. threads = tf.train.start_queue_runners(coord=coord, sess=sess) start_time = time.time() os.makedirs(os.path.join(args.img_path, args.body_dir), exist_ok=True) os.makedirs(os.path.join(args.img_path, args.vis_dir), exist_ok=True) # write the header rois_file = os.path.join(args.img_path, "rois.csv") if os.path.isfile(rois_file): print(f"The rois file {rois_file} already exists...") ans = None while all(ans != choice for choice in ("a", "o", "q")): ans = input("Do you want to (a)ppend, (o)verwrite, or (q)uit? ") if ans == "o": print("Overwriting existing rois file...") write_header(rois_file) elif ans == "q": sys.exit(1) else: write_header(rois_file) # Perform inference. 
t = trange(num_steps, desc="Inference progress", unit="img") for step in t: # run through the model jpg_path, c0, c1, c2, c3, c4, c5, c6, raw_output_up_ = sess.run( [ title, class_0, class_1, class_2, class_3, class_4, class_5, class_6, raw_output_up, ] ) # == First, save the body segmentation == if not args.no_body: # convert to a 2D compressed matrix, because we have a lot of 0's for the # background compressed = sparse.csr_matrix(np.squeeze(raw_output_up_)) fname = os.path.splitext(os.path.basename(str(jpg_path)))[0] out = os.path.join(args.img_path, args.body_dir, fname) sparse.save_npz(out, compressed) # == Next, save the ROIs == if not args.no_rois: img_id = extract_nums_only(fname) for c in (c0, c1, c2, c3, c4, c5, c6): try: min_x = np.min(c[:, 1]) except ValueError: min_x = None try: min_y = np.min(c[:, 0]) except ValueError: min_y = None try: max_x = np.max(c[:, 1]) except ValueError: max_x = None try: max_y = np.max(c[:, 0]) except ValueError: max_y = None # write out the stuff with open(rois_file, "a") as f: f.write( ",".join( (img_id, str(min_x), str(min_y), str(max_x), str(max_y), "\n") ) ) # Save an image of the mask for our own reference every 1000 steps if not args.no_vis and step % args.visualize_step == 0: preds = np.expand_dims(raw_output_up_, axis=3) msk = decode_labels(preds, num_classes=args.num_classes) # the mask im = Image.fromarray(msk[0]) # # Save the mask separately # jpg_path = str(jpg_path).split('/')[-1].split('.')[0] # out = os.path.join(args.vis_dir, jpg_path + '.png') # im.save(out) # Save the mask with background img_orig = Image.open(jpg_path) # create the final result using the mask and the original img = np.array(im) * 0.9 + np.array(img_orig) * 0.7 # clip surpassed colors img[img > 255] = 255 img = Image.fromarray(np.uint8(img)) out = os.path.join(args.img_path, args.vis_dir, fname + ".png") img.save(out) # # print('Image processed {}.png'.format(jpg_path)) t.set_description("Finished " + fname) total_time = time.time() - start_time print(f"The output files have been saved to {args.img_path}/{args.body_dir}") print(f"It took {total_time / num_steps} sec on each image.")
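# Hedged sketch (not from the source): recover a dense per-pixel label mask from one of the
# sparse .npz files written to args.body_dir above; 'frame_0001.npz' is an illustrative name.
from scipy import sparse

mask = sparse.load_npz("frame_0001.npz").toarray()
print(mask.shape, mask.max())  # same HxW as the input image, values are class ids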
def _adjacency(self, adj_path: str) -> None: """ Create self.adj_mats and self.degrees. Parameters ---------- adj_path : str Try to use drug-drug adjacency matrices saved in adj_path. If this is not possible, calculate them and save them in adj_path. Notes ----- self.adj_mats : Dict[Tuple[int, int], List[sp.csr_matrix]] From edge type to list of adjacency matrices for each edge class (e.g. (1, 1): list of drug-drug adjacency matrices for each se class). In our case all matrices in adj_mats are symmetric. self.degrees : Dict[int, List[int]] Number of connections for each node (0: genes, 1: drugs). """ gene_gene_adj = nx.adjacency_matrix(self.gene_net) # Number of connections for each gene gene_degrees = np.array(gene_gene_adj.sum(axis=0)).squeeze() drug_gene_adj = create_adj_matrix( a_item2b_item=self.stitch2proteins, ordered_list_a_item=self.ordered_list_of_drugs, ordered_list_b_item=self.ordered_list_of_proteins) gene_drug_adj = drug_gene_adj.transpose(copy=True) num_se = len(self.ordered_list_of_se) drug_drug_adj_list = [] try: print("Try to load drug-drug adjacency matrices from file.") if len(os.listdir(adj_path)) < num_se: raise IOError('Not all drug-drug adjacency matrices are saved') for i in range(num_se): drug_drug_adj_list.append( sp.load_npz(f'{adj_path}/sparse_matrix{i:04d}.npz').tocsr()) except IOError: print('Calculate drug-drug adjacency matrices') drug_drug_adj_list = create_combo_adj( combo_a_item2b_item=self.combo2se, combo_a_item2a_item=self.combo2stitch, ordered_list_a_item=self.ordered_list_of_drugs, ordered_list_b_item=self.ordered_list_of_se) print("Saving matrices to file") for i in range(len(drug_drug_adj_list)): sp.save_npz(f'{adj_path}/sparse_matrix{i:04d}.npz', drug_drug_adj_list[i].tocoo()) # Number of connections for each drug drug_degrees_list = [ np.array(drug_adj.sum(axis=0)).squeeze() for drug_adj in drug_drug_adj_list ] self.adj_mats = { (0, 0): [gene_gene_adj], (0, 1): [gene_drug_adj], (1, 0): [drug_gene_adj], (1, 1): drug_drug_adj_list, } self.degrees = { 0: [gene_degrees], 1: drug_degrees_list, }
for d in range(np.size(T.starts)): II = np.arange(T.starts[d]-1,T.ends[d]-1,1,dtype='int') ni = T.ni[II] ti = T.date[II] mi = T.monthi[II] for i in np.arange(np.size(ni)-1) : j = i+dm if j < np.size(ni) and j>0 : if np.absolute(ti[j]-ti[i]).astype(int) == M.tau : if ni[i] > -9900 and ni[j] > -9900: #PROBLEM: so far no solution for crossing paths between regions as defined at starts. Buoys crossing the boundary will be rejected. J.indi[mi[i]][tel[mi[i]]] = ni[i] J.indj[mi[i]][tel[mi[i]]] = ni[j] tel[mi[i]] += 1 M.nCross = [[] for _ in np.arange(M.nt) ] J.indicrop = [[] for _ in np.arange(M.nt) ] J.indjcrop = [[] for _ in np.arange(M.nt) ] #delete empty entries in indi and indj for m in np.arange(M.nt): J.indicrop[m] = J.indi[m][0:tel[m]] J.indjcrop[m] = J.indj[m][0:tel[m]] #sparse matrix per timestep with every origin and destination as coordinates in matrix, with all possible locations M.nc on i and j axis. sparseV = np.ones(np.shape(J.indicrop[m])) sparseI = J.indicrop[m] sparseJ = J.indjcrop[m] M.nCross[m] = sparse.coo_matrix((sparseV,(sparseI,sparseJ)),shape=(M.nc[-1],M.nc[-1])) sparse.save_npz(os.path.join('rawMatrices/',str(types[tp]) + 'transitmatrix_' + M.dir + '_raw_' + str(f) + '_monthindex' + str(m) + '.npz'), M.nCross[m]) print("--- %s seconds ---" % (time.time() - start_time))
train = pd.read_csv(Settings.train_cleaned_file_path) test = pd.read_csv(Settings.test_cleaned_file_path) # features = generate_doc_vec() pd.set_option('display.max_rows', 20000) train['comment_text'].fillna('null', inplace=True) test['comment_text'].fillna('null', inplace=True) merge = pd.concat([train.iloc[:, 0:2], test.iloc[:, 0:2]]) corpus = merge.comment_text tfidf_word = TfidfVectorizer(ngram_range=(1, 2), strip_accents="unicode", min_df=3, max_df=0.95, use_idf=True, smooth_idf=True, sublinear_tf=True, analyzer='word', max_features=10000) # tfidf_word.fit(corpus) word_tfidf_vec = tfidf_word.fit_transform(corpus) tfidf_char = TfidfVectorizer(ngram_range=(4, 6), strip_accents="unicode", analyzer='char', sublinear_tf=True, use_idf=True, smooth_idf=True, max_features=20000) # tfidf_char.fit(corpus) char_tfidf_vec = tfidf_char.fit_transform(corpus) # final_VSM = sparse.hstack((word_tfidf_vec, char_tfidf_vec, features), format='csr') final_VSM = sparse.hstack((word_tfidf_vec, char_tfidf_vec), format='csr') sparse.save_npz(features_file, final_VSM) elapsed = time.time() - start print("time to generate VSM: ", elapsed, "\n")
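# Hedged sketch (assumed downstream step, not from the source): reload the stacked TF-IDF
# matrix and split it back into the train and test blocks using the original row counts.
from scipy import sparse

final_VSM = sparse.load_npz(features_file).tocsr()
X_train = final_VSM[:len(train)]
X_test = final_VSM[len(train):]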
def to_disk(self, file_path): file_name, ext = os.path.splitext(file_path) save_npz(file_path, self.raw_data) with open(file_name + ".voc", "wb") as vocab_file: pickle.dump(self.vectorizer, vocab_file) self.identifiers.to_pickle(file_name + ".pkl")
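# Hedged counterpart (assumed, not from the source): a from_disk loader mirroring to_disk
# above -- the .npz matrix, the pickled vectorizer in the ".voc" file, and the identifiers
# pickled by pandas in the ".pkl" file.
import os
import pickle
import pandas as pd
from scipy.sparse import load_npz

def from_disk(file_path):
    file_name, ext = os.path.splitext(file_path)
    raw_data = load_npz(file_path)
    with open(file_name + ".voc", "rb") as vocab_file:
        vectorizer = pickle.load(vocab_file)
    identifiers = pd.read_pickle(file_name + ".pkl")
    return raw_data, vectorizer, identifiers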
compute_correlation(events, 250, num_threads, chunk_size, threshold, n_classes_hist) diff_time = time.time() - start print("Execution finished in: " + str(diff_time)) tiempos[i] = diff_time print("Max: " + str(tiempos.max()) + " Mean: " + str(tiempos.mean()) + " Min: " + str(tiempos.min())) xcm_pos = csr_matrix(xcm_pos, dtype=np.float32) xclags_pos = csr_matrix(xclags_pos, dtype=np.int32) xcm_neg = csr_matrix(xcm_neg, dtype=np.float32) xclags_neg = csr_matrix(xclags_neg, dtype=np.int32) max_hist = csr_matrix(max_hist, dtype=np.int32) min_hist = csr_matrix(min_hist, dtype=np.int32) save_npz('out_xcm_pos.npz', xcm_pos, compressed=True) save_npz('out_xcl_pos.npz', xclags_pos, compressed=True) save_npz('out_xcm_neg.npz', xcm_neg, compressed=True) save_npz('out_xcl_neg.npz', xclags_neg, compressed=True) save_npz('out_xmax_hist.npz', max_hist, compressed=True) save_npz('out_xmin_hist.npz', min_hist, compressed=True) # xcm_dense = xcm_pos.toarray() # print(xcm_dense.shape) #np.savez_compressed('output_matrices', xcm_pos=xcm_pos, xcl_pos=xclags_pos, xcm_neg=xcm_neg, xcl_neg=xclags_neg) # np.savetxt('xcm_v1_neg.txt', xcm_neg, delimiter='\t', fmt='%6.3f') # # np.savetxt('xcl_v1_neg.txt', xclags_neg, delimiter='\t', fmt='%6.0f') #
def save(self, vector_data_file): sparse.save_npz("%s-user" % vector_data_file, self._user_features) sparse.save_npz("%s-item" % vector_data_file, self._item_features)
def save_sp(folder, name, M): return sp.save_npz(folder+name+'_sp.npz', M.tocsr())
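# Hedged counterpart (assumed, not from the source): reload a matrix written by save_sp above.
def load_sp(folder, name):
    return sp.load_npz(folder + name + '_sp.npz').tocsr()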
def __init__(self, TOKENIZED_CORPUS, UNIQUE_WORD_LIST, EMBED_SIZE, CONTEXT_SIZE, X_MAX, ALPHA, TOTAL_PROCESS_NUM): """ This method initializes GloVeClass with the given parameters. Args: TOKENIZED_CORPUS(list) : list of all words in a corpus UNIQUE_WORD_LIST(ndarray) : list of all unique words EMBED_SIZE : the size of vector CONTEXT_SIZE : context window size X_MAX : maximum x size ALPHA : weighting function exponent TOTAL_PROCESS_NUM : number of processes used to build the co-occurrence matrix """ super(GloVeClass, self).__init__() print("[Initialization Start]") self.TOKENIZED_CORPUS = TOKENIZED_CORPUS self.UNIQUE_WORD_LIST = UNIQUE_WORD_LIST self.CONTEXT_SIZE = CONTEXT_SIZE self.EMBED_SIZE = EMBED_SIZE self.X_MAX = X_MAX self.ALPHA = ALPHA self.word_to_index = { word: index for index, word in enumerate(self.UNIQUE_WORD_LIST) } self.index_to_word = { index: word for index, word in enumerate(self.UNIQUE_WORD_LIST) } self.TOKENIZED_CORPUS_SIZE = len(self.TOKENIZED_CORPUS) self.UNIQUE_WORD_SIZE = len(self.UNIQUE_WORD_LIST) self.in_embed = nn.Embedding(self.UNIQUE_WORD_SIZE, self.EMBED_SIZE) self.in_embed.weight = xavier_normal(self.in_embed.weight) self.in_bias = nn.Embedding(self.UNIQUE_WORD_SIZE, 1) self.in_bias.weight = xavier_normal(self.in_bias.weight) self.out_embed = nn.Embedding(self.UNIQUE_WORD_SIZE, self.EMBED_SIZE) self.out_embed.weight = xavier_normal(self.out_embed.weight) self.out_bias = nn.Embedding(self.UNIQUE_WORD_SIZE, 1) self.out_bias.weight = xavier_normal(self.out_bias.weight) self.word_embeddings_array = None self.word_u_candidate = np.arange(self.UNIQUE_WORD_SIZE) self.word_v_candidate = np.arange(self.UNIQUE_WORD_SIZE) self.total_process_num = TOTAL_PROCESS_NUM if TOTAL_PROCESS_NUM: print("Build co-occurrence matrix with multiprocessing") print("TOTAL_PROCESS_NUM : ", TOTAL_PROCESS_NUM) queue = mp.Queue() ps = list() for i in range(self.total_process_num): ps.append( mp.Process(target=self.build_sub_co_occurence_matrix, args=(queue, i))) for p in ps: p.start() # collect the results from the queue for i in range(self.total_process_num): if i: col += queue.get() # if the queue is empty, this blocks until a value is available else: col = queue.get() for p in ps: p.terminate() col = np.array(col, dtype=np.int64) self.co_occurence_matrix = coo_matrix( (np.ones(col.size, dtype=np.int64), (np.zeros(col.size, dtype=np.int64), col)), shape=(1, int((self.UNIQUE_WORD_SIZE * (self.UNIQUE_WORD_SIZE + 1)) / 2)), dtype=np.int64) print("Done") tries = 10 while tries: try: print("SAVE co_occurence_matrix") # scipy.io.mmwrite('model/co_occurence_matrix.mtx', self.co_occurence_matrix) save_npz('model/co_occurence_matrix.npz', self.co_occurence_matrix) print("Done") except IOError as e: print("IOError happened") error = e tries -= 1 else: break if not tries: print("Failed to save matrix due to IOError") raise error else: print("Load co-occurrence matrix") # self.co_occurence_matrix = scipy.io.mmread('model/co_occurence_matrix.mtx') self.co_occurence_matrix = load_npz( 'model/co_occurence_matrix.npz') print("Done") self.co_occurence_matrix = self.co_occurence_matrix.todense() print("[Initialization Done]")
print(aug) if args.mode == 'val': val_df = pd.read_csv(str(data_path / 'val4_df.csv')) preds, labels = predict(model, val_df['file_name'].apply(Path).values, batch_size, aug=aug) target_file_name = args.model_type + '_test_' + str(aug) elif args.mode == 'test': test_hashes = pd.read_csv(str(data_path / 'test_hashes.csv')) train_hashes = pd.read_csv(str(data_path / 'train_hashes.csv')) test_hashes = test_hashes.drop_duplicates('md5') test_hashes = test_hashes[~test_hashes['md5'].isin(set(train_hashes['md5'].unique()))] bad_md5 = ['d704b9555801285eedb04213a02fdc41', '35e7e038fe2ec215f63bdb5e4b739524'] hashes = test_hashes['file_name'].apply(lambda x: data_path.parent / x, 1).values preds, labels = predict(model, hashes, batch_size, aug=aug, transform=transform) target_file_name = args.model_type + '_test_' + str(aug) labels = pd.DataFrame(labels, columns=['file_name']) print('[{}] Saving labels...'.format(str(datetime.datetime.now()))) labels.to_csv(str(model_path / (target_file_name + '.csv')), index=False) print('[{}] Saving predictions...'.format(str(datetime.datetime.now()))) save_npz(str(model_path / (target_file_name + '.npz')), preds)