"""Report hash collisions for one hashed-read file.

The file is chosen by rank (a 0-based index into the list of
``*.hashq.*`` files found in the input directory) and the collision
statistics returned by ``Fastq_Reader.collision_report`` are printed.
"""
import getopt
import glob
import os
import sys

help_message = 'usage example: python check_hash_collisions.py -r 2 -i /project/home/hashed_reads/ -o /project/home/cluster_vectors/'


def _usage_error():
    """Print the usage line and exit with status 2 (bad command line)."""
    print(help_message)
    sys.exit(2)


def _with_trailing_slash(path):
    """Return *path* with exactly one '/' appended if it lacks one."""
    return path if path.endswith('/') else path + '/'


def parse_args(argv):
    """Parse the command line (without the program name).

    Returns a ``(filerank, inputdir, outputdir)`` tuple where both
    directories end with '/'.  Prints the usage line and exits with
    status 2 on an unknown option, a non-integer rank, or a missing or
    empty required option; exits with status 0 after ``-h``/``--help``.
    """
    try:
        # Long option names must NOT carry leading dashes; getopt adds them.
        opts, _ = getopt.getopt(
            argv, 'hr:i:o:',
            ['help', 'filerank=', 'inputdir=', 'outputdir='])
    except getopt.GetoptError:
        _usage_error()

    fr = None
    inputdir = None
    outputdir = None
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(help_message)
            sys.exit()
        elif opt in ('-r', '--filerank'):
            try:
                fr = int(arg)
            except ValueError:
                _usage_error()
        elif opt in ('-i', '--inputdir'):
            inputdir = arg
        elif opt in ('-o', '--outputdir'):
            outputdir = arg

    # All three options are required; an empty directory string is
    # treated the same as a missing one.
    if fr is None or not inputdir or not outputdir:
        _usage_error()
    return fr, _with_trailing_slash(inputdir), _with_trailing_slash(outputdir)


def main(argv):
    """Run the collision report for the file selected by *argv*."""
    fr, inputdir, outputdir = parse_args(argv)

    # Imported here rather than at module level so that argument parsing
    # (and -h) does not require the project reader module to be importable.
    from fastq_reader import Fastq_Reader

    hashobject = Fastq_Reader(inputdir, outputdir)
    # glob returns matches in arbitrary (filesystem) order; the rank
    # indexes into that order as-is.
    HashFiles = glob.glob(os.path.join(hashobject.input_path, '*.hashq.*'))
    try:
        hashobject.infile = HashFiles[fr]
    except IndexError:
        sys.stderr.write(
            'file rank %d is out of range: found %d hashq file(s) in %s\n'
            % (fr, len(HashFiles), hashobject.input_path))
        sys.exit(1)

    t, n, H = hashobject.collision_report()
    # Single-argument print() with %-formatting emits the same text on
    # Python 2 and Python 3.
    print('total hashed kmers: %s' % (t,))
    print('total pairwise collisions: %s' % (n,))
    print('collision histogram: %s' % (H,))


if __name__ == "__main__":
    main(sys.argv[1:])
if opt in ('-h','--help'): print help_message sys.exit() elif opt in ('-r',"--filerank"): fr = int(arg)-1 elif opt in ('-i','--inputdir'): inputdir = arg if inputdir[-1] != '/': inputdir += '/' elif opt in ('-o','--outputdir'): outputdir = arg if outputdir[-1] != '/': outputdir += '/' hashobject = Fastq_Reader(inputdir,outputdir) Hashq_Files = glob.glob(os.path.join(hashobject.input_path,'*.hashq.*')) hashobject.infile = Hashq_Files[fr] hashobject.outfile = hashobject.output_path + 'intermediate_clusters/' + str(fr) hashobject.global_weights = np.load(hashobject.output_path + 'global_weights.npy') global_weight_sum = hashobject.global_weights.sum(dtype=np.float64) Cluster_Files = glob.glob(os.path.join(hashobject.output_path,'*.cluster.npy')) Cluster_Files = [(int(cf[cf.rfind('/')+1:cf.index('.')]),cf) for cf in Cluster_Files] cluster_sizes = np.load(hashobject.output_path+'kmer_cluster_sizes.npy') total_set_size = 0 cluster_weights = [] cluster_keys = [] outpart = 0 for ci,cf in Cluster_Files: # ignore super clusters and super small clusters if cluster_sizes[ci] < 0.2*2**hashobject.hash_size: cw = np.load(cf) cw_sum_prob = hashobject.global_weights[cw].sum(dtype=np.float64)/global_weight_sum