return args


# MAIN
if __name__ == "__main__":
    # Script entry point: build the cluster index from the LSI model stored
    # in the output directory, then save the index and its row count.
    args = interface()

    # Directories are made absolute and given a trailing '/', because the
    # output file names below are appended by plain string concatenation.
    input_dir = os.path.abspath(args.IN)
    input_dir = input_dir if input_dir.endswith('/') else input_dir + '/'

    output_dir = os.path.abspath(args.OUT)
    output_dir = output_dir if output_dir.endswith('/') else output_dir + '/'

    cluster_threshold = args.threshold

    hashobject = StreamingEigenhashes(input_dir, output_dir, get_pool=-1)

    # Key every conditioned count file by its position in the glob result.
    count_file_pattern = os.path.join(hashobject.input_path,
                                      '*.count.hash.conditioned')
    Kmer_Hash_Count_Files = glob.glob(count_file_pattern)
    hashobject.path_dict = dict(enumerate(Kmer_Hash_Count_Files))

    lsi_model_path = hashobject.output_path + 'kmer_lsi.gensim'
    lsi = models.LsiModel.load(lsi_model_path)

    # The threshold is set on the object before the index is computed.
    hashobject.cluster_thresh = cluster_threshold
    Index = hashobject.lsi_cluster_index(lsi)

    np.save(hashobject.output_path + 'cluster_index.npy', Index)
    print('Cluster index has shape: ' + str(Index.shape))

    # Record the size of the first axis of the index in a small text file.
    num_clusters_path = hashobject.output_path + 'numClusters.txt'
    with open(num_clusters_path, 'w') as out_handle:
        out_handle.write('{0}\n'.format(Index.shape[0]))
# Example #2
0
import numpy as np
from streaming_eigenhashes import StreamingEigenhashes

help_message = 'usage example: python kmer_clusters.py -i /project/home/hashed_reads/ -o /project/home/cluster_vectors/'


def parse_directories(argv):
    """Return ``(inputdir, outputdir)`` parsed from the option list *argv*.

    Recognised options are ``-i/--inputdir``, ``-o/--outputdir`` and
    ``-h/--help``. Both returned paths end with '/', because callers append
    file names to them by string concatenation.

    Prints ``help_message`` and exits with status 2 when the options are
    malformed or when either directory is missing or empty; prints it and
    exits normally for ``-h``/``--help``.
    """
    try:
        # "help" must be registered here, otherwise --help is rejected by
        # getopt before the loop below ever sees it.
        opts, _ = getopt.getopt(argv, 'hi:o:',
                                ["help", "inputdir=", "outputdir="])
    except getopt.GetoptError:
        print(help_message)
        sys.exit(2)

    inputdir = None
    outputdir = None
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(help_message)
            sys.exit()
        elif opt in ('-i', '--inputdir'):
            inputdir = arg
        elif opt in ('-o', '--outputdir'):
            outputdir = arg

    # Fail with the usage text instead of a NameError/IndexError further down.
    if not inputdir or not outputdir:
        print(help_message)
        sys.exit(2)

    if not inputdir.endswith('/'):
        inputdir += '/'
    if not outputdir.endswith('/'):
        outputdir += '/'
    return inputdir, outputdir


if __name__ == "__main__":
    inputdir, outputdir = parse_directories(sys.argv[1:])
    hashobject = StreamingEigenhashes(inputdir, outputdir, get_pool=False)
    Kmer_Hash_Count_Files = glob.glob(
        os.path.join(hashobject.input_path, '*.nonzero.npy'))
    corpus_generator = hashobject.corpus_idf_from_hash_paths(
        Kmer_Hash_Count_Files)
    # global_weights is read from hashobject after training; presumably it is
    # populated by train_tfidf -- confirm against StreamingEigenhashes.
    hashobject.train_tfidf(corpus_generator)
    np.save(hashobject.output_path + 'global_weights.npy',
            hashobject.global_weights)
# Example #3
0
        # Handle one (opt, arg) pair. The enclosing loop header is not
        # visible in this chunk; presumably it iterates over getopt results.
        if opt in ('-h', '--help'):
            # Function-call form prints the same text on Python 2 and also
            # parses on Python 3.
            print(help_message)
            sys.exit()
        elif opt in ('-i', '--inputdir'):
            # Directories are kept with a trailing '/'.
            inputdir = arg
            if inputdir[-1] != '/':
                inputdir += '/'
        elif opt in ('-o', '--outputdir'):
            outputdir = arg
            if outputdir[-1] != '/':
                outputdir += '/'
        elif opt in ('-p', '--numproc'):
            num_proc = int(arg)
        elif opt in ('-s', '--single'):
            # Later passed as the `single` argument of train_kmer_lsi.
            singleInstance = True
    ### use -p option for multiprocessing
    # NOTE(review): this assignment runs after option parsing, so any value
    # supplied with -p/--numproc is overwritten with -1 here. Left unchanged
    # because it is unclear whether the override is deliberate -- confirm.
    num_proc = -1
    ###
    hashobject = StreamingEigenhashes(inputdir, outputdir, get_pool=num_proc)
    Kmer_Hash_Count_Files = glob.glob(
        os.path.join(hashobject.input_path, '*.count.hash.conditioned'))
    # Key every conditioned count file by its position in the glob result.
    hashobject.path_dict = dict(enumerate(Kmer_Hash_Count_Files))
    corpus = hashobject.kmer_corpus_from_disk()
    # This is a hack. Should do a better job choosing num_dims.
    # Floor division keeps num_dims an integer on Python 3; on Python 2 it
    # gives the same value as the original `/` on ints.
    lsi = hashobject.train_kmer_lsi(corpus,
                                    num_dims=len(hashobject.path_dict) * 4 // 5,
                                    single=singleInstance)
    lsi.save(hashobject.output_path + 'kmer_lsi.gensim')