return args # MAIN if __name__ == "__main__": args = interface() input_dir = os.path.abspath(args.IN) if not input_dir.endswith('/'): input_dir += '/' output_dir = os.path.abspath(args.OUT) if not output_dir.endswith('/'): output_dir += '/' thresh = args.threshold hashobject = StreamingEigenhashes(input_dir, output_dir, get_pool=-1) Kmer_Hash_Count_Files = glob.glob(os.path.join(hashobject.input_path, '*.count.hash.conditioned')) hashobject.path_dict = {} for i in range(len(Kmer_Hash_Count_Files)): hashobject.path_dict[i] = Kmer_Hash_Count_Files[i] lsi = models.LsiModel.load(hashobject.output_path + 'kmer_lsi.gensim') hashobject.cluster_thresh = thresh Index = hashobject.lsi_cluster_index(lsi) np.save(hashobject.output_path + 'cluster_index.npy', Index) print('Cluster index has shape: ' + str(Index.shape)) with open(hashobject.output_path + 'numClusters.txt', 'w') as f: f.write('{0}\n'.format(Index.shape[0]))
import numpy as np
from streaming_eigenhashes import StreamingEigenhashes

# NOTE(review): the usage text names kmer_clusters.py, while this script
# calls train_tfidf and saves global_weights.npy -- confirm the script name
# shown to users is the intended one.
help_message = 'usage example: python kmer_clusters.py -i /project/home/hashed_reads/ -o /project/home/cluster_vectors/'

if __name__ == "__main__":
    # Command-line options:
    #   -i / --inputdir   directory searched for '*.nonzero.npy' files
    #   -o / --outputdir  directory that receives 'global_weights.npy'
    #   -h / --help       print the usage text and exit
    try:
        # "help" is registered so that the --help branch below is reachable.
        opts, args = getopt.getopt(
            sys.argv[1:], 'hi:o:', ["help", "inputdir=", "outputdir="])
    except getopt.GetoptError:
        # Only malformed options are handled here; anything else (for
        # example KeyboardInterrupt) is allowed to propagate.
        # The parenthesised single-argument print behaves the same as a
        # Python 2 statement and as a Python 3 function call.
        print(help_message)
        sys.exit(2)

    inputdir = None
    outputdir = None
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(help_message)
            sys.exit()
        elif opt in ('-i', '--inputdir'):
            inputdir = arg
        elif opt in ('-o', '--outputdir'):
            outputdir = arg

    # Both directories are required; a missing or empty value is reported
    # with the usage text instead of failing later with NameError/IndexError.
    if not inputdir or not outputdir:
        print(help_message)
        sys.exit(2)

    # Later paths are built by plain string concatenation on output_path,
    # so both directories get a trailing slash before being handed over.
    if not inputdir.endswith('/'):
        inputdir += '/'
    if not outputdir.endswith('/'):
        outputdir += '/'

    hashobject = StreamingEigenhashes(inputdir, outputdir, get_pool=False)
    Kmer_Hash_Count_Files = glob.glob(
        os.path.join(hashobject.input_path, '*.nonzero.npy'))
    corpus_generator = hashobject.corpus_idf_from_hash_paths(
        Kmer_Hash_Count_Files)
    hashobject.train_tfidf(corpus_generator)
    np.save(hashobject.output_path + 'global_weights.npy',
            hashobject.global_weights)
# Fragment of a command-line driver: the tail of an option-handling chain
# (-h/--help, -i/--inputdir, -o/--outputdir, -p/--numproc, -s/--single)
# followed by LSI training.  After the options it:
#   * builds StreamingEigenhashes(inputdir, outputdir, get_pool=num_proc);
#   * globs '*.count.hash.conditioned' under hashobject.input_path and
#     numbers the matches 0..n-1 into hashobject.path_dict;
#   * obtains a corpus from hashobject.kmer_corpus_from_disk();
#   * trains with train_kmer_lsi, num_dims = len(path_dict) * 4 / 5;
#   * saves the model to hashobject.output_path + 'kmer_lsi.gensim'.
# NOTE(review): line breaks and indentation appear to have been lost, and
# the chain starts inside a loop whose header is not visible here.  The
# nesting cannot be recovered with certainty (e.g. whether `num_proc = -1`
# belongs to the -s branch or follows the option loop), so the code is left
# untouched -- restore the layout from version control.
# NOTE(review): `num_proc = -1` presumably overrides any value parsed from
# -p -- confirm that is intended.
# NOTE(review): singleInstance is only assigned in the -s branch shown here;
# presumably it is initialised before this fragment -- verify.
# NOTE(review): `* 4 / 5` yields a float under Python 3 true division; the
# print statements suggest Python 2 -- confirm before porting.
if opt in ('-h', '--help'): print help_message sys.exit() elif opt in ('-i', '--inputdir'): inputdir = arg if inputdir[-1] != '/': inputdir += '/' elif opt in ('-o', '--outputdir'): outputdir = arg if outputdir[-1] != '/': outputdir += '/' elif opt in ('-p', '--numproc'): num_proc = int(arg) elif opt in ('-s', '--single'): singleInstance = True ### use -p option for multiprocessing num_proc = -1 ### hashobject = StreamingEigenhashes(inputdir, outputdir, get_pool=num_proc) Kmer_Hash_Count_Files = glob.glob( os.path.join(hashobject.input_path, '*.count.hash.conditioned')) hashobject.path_dict = {} for i in range(len(Kmer_Hash_Count_Files)): hashobject.path_dict[i] = Kmer_Hash_Count_Files[i] corpus = hashobject.kmer_corpus_from_disk() # This is a hack. Should do a better job chosing num_dims lsi = hashobject.train_kmer_lsi(corpus, num_dims=len(hashobject.path_dict) * 4 / 5, single=singleInstance) lsi.save(hashobject.output_path + 'kmer_lsi.gensim')