"""Write the k-mer corpus for one hashed-reads file to disk.

Selects a single *.count.hash file from the input directory by task rank
(-r, 1-based on the command line) and calls kmer_corpus_to_disk on it.
"""
import getopt
import glob
import os
import sys

import numpy as np

from streaming_eigenhashes import StreamingEigenhashes

help_message = "usage example: python kmer_corpus.py -i /project/home/hashed_reads/ -o /project/home/cluster_vectors/"

if __name__ == "__main__":
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hi:o:r:", ["inputdir=", "outputdir=", "filerank="])
    except getopt.GetoptError:
        # Narrowed from a bare `except:` so unrelated errors are not swallowed;
        # also fixes the missing `import sys, getopt` in the original.
        print(help_message)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(help_message)
            sys.exit()
        elif opt in ("-r", "--filerank"):
            # Task ranks are 1-based on the command line; convert to 0-based.
            fr = int(arg) - 1
        elif opt in ("-i", "--inputdir"):
            inputdir = arg
            if inputdir[-1] != "/":
                inputdir += "/"
        elif opt in ("-o", "--outputdir"):
            outputdir = arg
            if outputdir[-1] != "/":
                outputdir += "/"
    # get_pool=False: this task processes a single file, no worker pool needed.
    hashobject = StreamingEigenhashes(inputdir, outputdir, get_pool=False)
    Kmer_Hash_Count_Files = glob.glob(os.path.join(hashobject.input_path, "*.count.hash"))
    # M = np.load(hashobject.input_path+'column_mask.npy')
    M = []  # empty mask: no columns are masked out (see commented line above)
    hashobject.kmer_corpus_to_disk(Kmer_Hash_Count_Files[fr], mask=M)
"""Train global TF-IDF weights over all hashed k-mer corpora.

Streams the *.nonzero.npy files from the input directory through
train_tfidf and saves the resulting global weight vector to
<outputdir>/global_weights.npy.
"""
import getopt
import glob
import os
import sys

import numpy as np

from streaming_eigenhashes import StreamingEigenhashes

help_message = 'usage example: python kmer_clusters.py -i /project/home/hashed_reads/ -o /project/home/cluster_vectors/'

if __name__ == "__main__":
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hi:o:', ["inputdir=", "outputdir="])
    except getopt.GetoptError:
        # Narrowed from a bare `except:`; also fixes the missing
        # sys/getopt/glob/os imports this chunk relied on.
        print(help_message)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(help_message)
            sys.exit()
        elif opt in ('-i', '--inputdir'):
            inputdir = arg
            if inputdir[-1] != '/':
                inputdir += '/'
        elif opt in ('-o', '--outputdir'):
            outputdir = arg
            if outputdir[-1] != '/':
                outputdir += '/'
    # get_pool=False: a single streaming pass, no worker pool needed.
    hashobject = StreamingEigenhashes(inputdir, outputdir, get_pool=False)
    Kmer_Hash_Count_Files = glob.glob(os.path.join(hashobject.input_path, '*.nonzero.npy'))
    corpus_generator = hashobject.corpus_idf_from_hash_paths(Kmer_Hash_Count_Files)
    hashobject.train_tfidf(corpus_generator)
    np.save(hashobject.output_path + 'global_weights.npy', hashobject.global_weights)
# NOTE(review): partial chunk collapsed onto one line — the `try:` that this
# leading `except:` closes, and the tail of the final loop body, lie outside
# this view; left byte-identical because it cannot be reformatted safely.
# Visible logic: parse -h/-i/-o options, load every <outputdir>/*.cluster.npy
# membership array (keyed by the integer prefix of its filename), compute each
# cluster's share of the global weights (CP) and its size, and record cluster
# id i+1 for its members in column Ix[c] of the 2**hash_size x 5 table X.
# Presumably Ix[c] is incremented after this line, beyond the chunk — TODO confirm.
# NOTE(review): the bare `except:` swallows all errors; should be
# `except getopt.GetoptError:` when this script is reassembled.
except: print help_message sys.exit(2) for opt, arg in opts: if opt in ('-h', '--help'): print help_message sys.exit() elif opt in ('-i', '--inputdir'): inputdir = arg if inputdir[-1] != '/': inputdir += '/' elif opt in ('-o', '--outputdir'): outputdir = arg if outputdir[-1] != '/': outputdir += '/' hashobject = StreamingEigenhashes(inputdir, outputdir, get_pool=-1) FP = glob.glob(os.path.join(hashobject.output_path, '*.cluster.npy')) FP = [(int(fp[fp.rfind('/') + 1:fp.index('.cluster')]), fp) for fp in FP] I = np.load(hashobject.output_path + 'cluster_index.npy') I = I.shape[0] cluster_sizes = np.zeros(I, dtype=np.uint64) GW = np.load(hashobject.output_path + 'global_weights.npy') global_weight_sum = GW.sum(dtype=np.float64) CP = np.zeros(I) X = np.zeros((2**hashobject.hash_size, 5), dtype=np.int16) Ix = np.zeros(2**hashobject.hash_size, dtype=np.int8) for i, fp in FP: c = np.load(fp) CP[i] = GW[c].sum(dtype=np.float64) / global_weight_sum cluster_sizes[i] = c.shape[0] X[c, Ix[c]] = i + 1
# NOTE(review): partial chunk — `return args` is the tail of an argparse
# `interface()` definition whose start lies outside this view; left
# byte-identical because it cannot be reformatted safely in isolation.
# Main: normalizes input/output dirs to a trailing '/', maps an index to each
# *.count.hash.conditioned file in path_dict, loads the trained gensim LSI
# model, builds the cluster index via lsi_cluster_index at args.threshold,
# saves it to cluster_index.npy, and records the cluster count in
# numClusters.txt.
return args # MAIN if __name__ == "__main__": args = interface() input_dir = os.path.abspath(args.IN) if not input_dir.endswith('/'): input_dir += '/' output_dir = os.path.abspath(args.OUT) if not output_dir.endswith('/'): output_dir += '/' thresh = args.threshold hashobject = StreamingEigenhashes(input_dir, output_dir, get_pool=-1) Kmer_Hash_Count_Files = glob.glob(os.path.join(hashobject.input_path, '*.count.hash.conditioned')) hashobject.path_dict = {} for i in range(len(Kmer_Hash_Count_Files)): hashobject.path_dict[i] = Kmer_Hash_Count_Files[i] lsi = models.LsiModel.load(hashobject.output_path + 'kmer_lsi.gensim') hashobject.cluster_thresh = thresh Index = hashobject.lsi_cluster_index(lsi) np.save(hashobject.output_path + 'cluster_index.npy', Index) print('Cluster index has shape: ' + str(Index.shape)) with open(hashobject.output_path + 'numClusters.txt', 'w') as f: f.write('{0}\n'.format(Index.shape[0]))
# NOTE(review): partial chunk — the opening of the argparse `interface()`
# definition (parser construction and the -i/-o arguments) lies outside this
# view; left byte-identical because it cannot be reformatted safely.
# Main: converts the 1-based --task_rank to a 0-based index and writes the
# kmer corpus for that single *.count.hash file via kmer_corpus_to_disk, with
# an empty column mask M (the np.load of column_mask.npy is commented out).
# ("currant" in the help text is a typo for "current"; it is runtime string
# content, so it is not altered in this documentation-only pass.)
dest='task_rank', type=int, metavar='<task_rank>', help='The rank of the currant task.') args = parser.parse_args() return args if __name__ == "__main__": args = interface() input_dir = os.path.abspath(args.IN) if not input_dir.endswith('/'): input_dir += '/' output_dir = os.path.abspath(args.OUT) if not output_dir.endswith('/'): output_dir += '/' task_rank = args.task_rank - 1 hashobject = StreamingEigenhashes(input_dir, output_dir, get_pool=False) Kmer_Hash_Count_Files = glob.glob( os.path.join(hashobject.input_path, '*.count.hash')) # M = np.load(hashobject.input_path + 'column_mask.npy') M = [] print("[KmerCorpus] Computing kmer corpus.") hashobject.kmer_corpus_to_disk(Kmer_Hash_Count_Files[task_rank], mask=M) print("[KmerCorpus] Done.")
"""Train global TF-IDF weights over all hashed k-mer corpora.

Streams the *.nonzero.npy files from the input directory through
train_tfidf and saves the resulting global weight vector to
<outputdir>/global_weights.npy.
"""
import getopt
import glob
import os
import sys

import numpy as np
from gensim import corpora

from streaming_eigenhashes import StreamingEigenhashes

help_message = 'usage example: python kmer_clusters.py -i /project/home/hashed_reads/ -o /project/home/cluster_vectors/'

if __name__ == "__main__":
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hi:o:', ["inputdir=", "outputdir="])
    except getopt.GetoptError:
        # Narrowed from a bare `except:` so real errors are not swallowed.
        print(help_message)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(help_message)
            sys.exit()
        elif opt in ('-i', '--inputdir'):
            inputdir = arg
            if inputdir[-1] != '/':
                inputdir += '/'
        elif opt in ('-o', '--outputdir'):
            outputdir = arg
            if outputdir[-1] != '/':
                outputdir += '/'
    # get_pool=False: a single streaming pass, no worker pool needed.
    hashobject = StreamingEigenhashes(inputdir, outputdir, get_pool=False)
    Kmer_Hash_Count_Files = glob.glob(os.path.join(hashobject.input_path, '*.nonzero.npy'))
    corpus_generator = hashobject.corpus_idf_from_hash_paths(Kmer_Hash_Count_Files)
    hashobject.train_tfidf(corpus_generator)
    np.save(hashobject.output_path + 'global_weights.npy', hashobject.global_weights)
# NOTE(review): partial chunk cut at both ends — the getopt loop header
# before `if opt in ('-h', ...)` and the `except` clause matching the final
# `try:` lie outside this view; left byte-identical because it cannot be
# reformatted safely in isolation.
# Visible logic: parse -h/-i/-o/-r/-t options, load the gensim LSI model and
# the precomputed cluster index, select a 10**6-wide slice of hash space
# starting at fr*10**6 (fr is the 0-based file rank), cluster that slice with
# lsi_cluster_part at the given threshold, and save each part to
# <outputdir>/<cluster_id>/<rank>.npy. Presumably the missing except handles
# the case where the cluster directory does not exist yet — TODO confirm.
if opt in ('-h', '--help'): print help_message sys.exit() elif opt in ('-i', '--inputdir'): inputdir = arg if inputdir[-1] != '/': inputdir += '/' elif opt in ('-o', '--outputdir'): outputdir = arg if outputdir[-1] != '/': outputdir += '/' elif opt in ('-r', '--filerank'): fr = int(arg) - 1 elif opt in ('-t', '--thresh'): thresh = float(arg) hashobject = StreamingEigenhashes(inputdir, outputdir, get_pool=-1) Kmer_Hash_Count_Files = glob.glob( os.path.join(hashobject.input_path, '*.count.hash.conditioned')) hashobject.path_dict = {} for i in range(len(Kmer_Hash_Count_Files)): hashobject.path_dict[i] = Kmer_Hash_Count_Files[i] lsi = models.LsiModel.load(hashobject.output_path + 'kmer_lsi.gensim') Index = np.load(hashobject.output_path + 'cluster_index.npy') i = fr * 10**6 o = (i, min(10**6, 2**hashobject.hash_size - i)) hashobject.cluster_thresh = thresh Ci = hashobject.lsi_cluster_part(o, lsi, Index) for ci, c in enumerate(Ci): try: np.save(hashobject.output_path + str(ci) + '/' + str(fr) + '.npy', c)
# NOTE(review): partial chunk — the getopt loop header and the script's
# imports lie outside this view; left byte-identical because it cannot be
# reformatted safely in isolation.
# Visible logic: parse options (-p sets the worker-pool size; -s forces a
# single gensim LSI instance and resets num_proc to -1), index every
# *.count.hash.conditioned file in path_dict, stream the corpus from disk,
# train an LSI model with num_dims = 4/5 of the number of input files
# (flagged as a hack by the original author), and save it as kmer_lsi.gensim.
# NOTE(review): `len(...) * 4 / 5` relies on Python 2 integer division —
# under Python 3 this yields a float; use `* 4 // 5` if porting.
if opt in ('-h', '--help'): print help_message sys.exit() elif opt in ('-i', '--inputdir'): inputdir = arg if inputdir[-1] != '/': inputdir += '/' elif opt in ('-o', '--outputdir'): outputdir = arg if outputdir[-1] != '/': outputdir += '/' elif opt in ('-p', '--numproc'): num_proc = int(arg) elif opt in ('-s', '--single'): singleInstance = True ### use -p option for multiprocessing num_proc = -1 ### hashobject = StreamingEigenhashes(inputdir, outputdir, get_pool=num_proc) Kmer_Hash_Count_Files = glob.glob( os.path.join(hashobject.input_path, '*.count.hash.conditioned')) hashobject.path_dict = {} for i in range(len(Kmer_Hash_Count_Files)): hashobject.path_dict[i] = Kmer_Hash_Count_Files[i] corpus = hashobject.kmer_corpus_from_disk() # This is a hack. Should do a better job chosing num_dims lsi = hashobject.train_kmer_lsi(corpus, num_dims=len(hashobject.path_dict) * 4 / 5, single=singleInstance) lsi.save(hashobject.output_path + 'kmer_lsi.gensim')
# NOTE(review): partial chunk — the `try: opts, args = getopt.getopt(...)`
# and `except` that this leading `sys.exit(2)` belongs to are cut off from
# this view; left byte-identical because it cannot be reformatted safely.
# Visible logic: parse -h/-i/-o/-r/-t, index the conditioned count-hash
# files in path_dict, load the gensim LSI model, build the cluster index at
# the given threshold via lsi_cluster_index, save cluster_index.npy, and
# write the number of clusters to numClusters.txt. Python 2 `print`
# statements and the manual open/write/close (rather than a `with` block)
# are left as-is in this documentation-only pass.
sys.exit(2) for opt, arg in opts: if opt in ("-h", "--help"): print help_message sys.exit() elif opt in ("-i", "--inputdir"): inputdir = arg if inputdir[-1] != "/": inputdir += "/" elif opt in ("-o", "--outputdir"): outputdir = arg if outputdir[-1] != "/": outputdir += "/" elif opt in ("-r", "--filerank"): fr = int(arg) - 1 elif opt in ("-t", "--thresh"): thresh = float(arg) hashobject = StreamingEigenhashes(inputdir, outputdir, get_pool=-1) Kmer_Hash_Count_Files = glob.glob(os.path.join(hashobject.input_path, "*.count.hash.conditioned")) hashobject.path_dict = {} for i in range(len(Kmer_Hash_Count_Files)): hashobject.path_dict[i] = Kmer_Hash_Count_Files[i] lsi = models.LsiModel.load(hashobject.output_path + "kmer_lsi.gensim") hashobject.cluster_thresh = thresh Index = hashobject.lsi_cluster_index(lsi) np.save(hashobject.output_path + "cluster_index.npy", Index) print "cluster index has shape:", Index.shape f = open(hashobject.output_path + "numClusters.txt", "w") f.write("%d\n" % Index.shape[0]) f.close()
# NOTE(review): partial chunk cut at both ends — the getopt loop header is
# missing, and after the trailing `except IOError: os.system('mkdir ...')`
# there is no visible retry of the failed np.save; presumably the retry
# follows beyond this view — TODO confirm. Left byte-identical because it
# cannot be reformatted safely in isolation.
# Visible logic: parse -h/-i/-o/-r/-t, load the LSI model and cluster index,
# cluster the 10**6-wide hash-space slice at offset fr*10**6 with
# lsi_cluster_part, and save each part to <outputdir>/<cluster_id>/<rank>.npy,
# creating the cluster directory on IOError.
# NOTE(review): `os.system('mkdir ' + path)` builds a shell command from a
# path string; os.makedirs(path) would avoid the shell and be portable.
if opt in ('-h','--help'): print help_message sys.exit() elif opt in ('-i','--inputdir'): inputdir = arg if inputdir[-1] != '/': inputdir += '/' elif opt in ('-o','--outputdir'): outputdir = arg if outputdir[-1] != '/': outputdir += '/' elif opt in ('-r','--filerank'): fr = int(arg) - 1 elif opt in ('-t','--thresh'): thresh = float(arg) hashobject = StreamingEigenhashes(inputdir,outputdir,get_pool=-1) Kmer_Hash_Count_Files = glob.glob(os.path.join(hashobject.input_path,'*.count.hash.conditioned')) hashobject.path_dict = {} for i in range(len(Kmer_Hash_Count_Files)): hashobject.path_dict[i] = Kmer_Hash_Count_Files[i] lsi = models.LsiModel.load(hashobject.output_path+'kmer_lsi.gensim') Index = np.load(hashobject.output_path+'cluster_index.npy') i = fr*10**6 o = (i,min(10**6,2**hashobject.hash_size-i)) hashobject.cluster_thresh = thresh Ci = hashobject.lsi_cluster_part(o,lsi,Index) for ci,c in enumerate(Ci): try: np.save(hashobject.output_path+str(ci)+'/'+str(fr)+'.npy',c) except IOError: os.system('mkdir '+hashobject.output_path+str(ci))
# NOTE(review): partial chunk — the `try: opts, args = getopt.getopt(...)`
# and `except` that this leading `print help_message / sys.exit(2)` error
# branch belongs to are cut off from this view; left byte-identical because
# it cannot be reformatted safely in isolation.
# Visible logic: parse -h/-i/-o/-p/-s (-p sets the worker-pool size; -s
# forces a single gensim LSI instance and resets num_proc to -1), index the
# *.count.hash.conditioned files in path_dict, stream the corpus from disk,
# train an LSI model with num_dims = len(files)*4/5 (Python 2 integer
# division; the original author flags this as a hack), and save it as
# kmer_lsi.gensim.
print help_message sys.exit(2) for opt, arg in opts: if opt in ('-h','--help'): print help_message sys.exit() elif opt in ('-i','--inputdir'): inputdir = arg if inputdir[-1] != '/': inputdir += '/' elif opt in ('-o','--outputdir'): outputdir = arg if outputdir[-1] != '/': outputdir += '/' elif opt in ('-p','--numproc'): num_proc = int(arg) elif opt in ('-s','--single'): singleInstance = True ### use -p option for multiprocessing num_proc = -1 ### hashobject = StreamingEigenhashes(inputdir,outputdir,get_pool=num_proc) Kmer_Hash_Count_Files = glob.glob(os.path.join(hashobject.input_path,'*.count.hash.conditioned')) hashobject.path_dict = {} for i in range(len(Kmer_Hash_Count_Files)): hashobject.path_dict[i] = Kmer_Hash_Count_Files[i] corpus = hashobject.kmer_corpus_from_disk() # This is a hack. Should do a better job chosing num_dims lsi = hashobject.train_kmer_lsi(corpus,num_dims=len(hashobject.path_dict)*4/5,single=singleInstance) lsi.save(hashobject.output_path+'kmer_lsi.gensim')