Example #1
def divide_dataset(allFilePath, dateset_rate):
    all_FileName = get_filename(allFilePath)
    # print(all_FileName)
    # print(type(all_FileName))
    # Check whether the split has already been done; if so, load the saved data directly
    dataFilePath = './mid_data'
    trainSetFile = 'trainSet.npy'
    testSetFile = 'testSet.npy'
    state_trainSet = checkFile(dataFilePath, trainSetFile)
    state_testSet = checkFile(dataFilePath, testSetFile)
    if state_trainSet and state_testSet:
        print("===================LoadFile=========================\n")
        trainSet = np.load("./mid_data/trainSet.npy")
        testSet = np.load("./mid_data/testSet.npy")
    else:
        # Split the file list into two sets by the given ratio: training set and test set
        allFileName = np.array(all_FileName)
        fileNum = allFileName.shape[0]
        trainNum = int(fileNum * dateset_rate)
        trainSetIndex = np.random.choice(fileNum, trainNum, replace=False)
        trainSet = allFileName[trainSetIndex]  # training set
        testSetIndex = np.arange(fileNum)
        testSetIndex = np.delete(testSetIndex, trainSetIndex)
        testSet = allFileName[testSetIndex]  # test set
        # Save the split so it can be reloaded on later runs
        np.save("./mid_data/trainSet.npy", trainSet)
        np.save("./mid_data/testSet.npy", testSet)
    return trainSet, testSet
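This snippet (and the other audio-processing examples below) relies on two helpers that are not shown: checkFile(path, name), which appears to report whether a file already exists in a directory, and get_filename(root), which appears to collect the .wav files under a directory tree. A minimal sketch of what they might look like (signatures taken from the calls above; the bodies are assumptions):

import os

def checkFile(path, name):
    # Assumed behavior: True if the file `name` exists inside directory `path`.
    return os.path.isfile(os.path.join(path, name))

def get_filename(root):
    # Assumed behavior: recursively collect the .wav file paths under `root`.
    wav_files = []
    for dirpath, _, filenames in os.walk(root):
        for f in filenames:
            if f.lower().endswith('.wav'):
                wav_files.append(os.path.join(dirpath, f))
    return wav_files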
Example #2
def readOptAp(argv,  outputdir, selectodir, peakfile, annofile, peakcaller, prefix, graph):
	try:                                
	    opts, args = getopt.getopt(argv, "hf:a:o:", ["help", "plot", "name="]) # List of all short and long options possible
	except getopt.GetoptError:
	    usageAp()
	    sys.exit(2)
	for opt, arg in opts:
		if opt in ("-h", "--help"): # Print usage message
			usageAp()
			sys.exit(0)
		elif opt == '-f': # Peak file
			checkFile(arg)
			peakfile = arg
		elif opt == '-a': # Annotation file
			checkBed(arg)
			annofile = arg
		elif opt == '-o': # Output directory is specified
			if not checkPath(arg):
				print "Error, the specified path '"+arg+"' does not exists"
				sys.exit(1)
			outputdir = arg
			selectodir = 'true'
		elif opt == '--name': # Prefix to give to output file
			prefix = arg
		elif opt == '--plot':
			graph = 'ON'
	return outputdir, selectodir, peakfile, annofile, peakcaller, prefix, graph
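For context, a getopt parser like this one is normally driven from the command line. A hypothetical invocation (the default values passed in here are placeholders for illustration, not taken from the original program) might look like:

import sys

# Hypothetical driver: hand everything after the sub-command name to the parser.
outputdir, selectodir, peakfile, annofile, peakcaller, prefix, graph = readOptAp(
    sys.argv[2:], '.', 'false', '', '', '', '', 'OFF')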
Example #3
def readOptCa(argv, outputdir, selectodir, filterqual, unmapped, filtercoord, indexGenome, rmvdup, sorting, indexBam, coordinatefile, prefix, genome, minqual, fastqfile, fastqfile1, fastqfile2, seq):
	try:                                
	    opts, args = getopt.getopt(argv, "hf:1:2:g:o:q:FL:", ["help", "index", "rmdup", "sort", "bamIndex", "name="]) # List of all short and long options possible
	except getopt.GetoptError:
	    usageCa()
	    sys.exit(2)
	for opt, arg in opts:
		if opt in ("-h", "--help"): # Print usage message
			usageCa()
			sys.exit(0)
		elif opt == '-f': # Means that just one SE fastq file is given
			checkFastq(arg)
			fastqfile = arg
			seq = 'SE'
		elif opt == '-1': # Means that R1 PE fastq file is given
			checkFastq(arg)
			fastqfile1 = arg
			seq = 'PE'
		elif opt == '-2': # Means that R2 PE fastq file is given
			checkFastq(arg)
			fastqfile2 = arg
		elif opt == '-g': # genome prefix is given
			checkGenome(arg)
			genome = arg
		elif opt == '-o': # Output directory is specified
			if not checkPath(arg):
				print "Error, the specified path '"+arg+"' does not exists"
				sys.exit(1)
			outputdir = arg
			selectodir = 'true'
		elif opt == '-q': # Filter out reads with low quality
			minqual = arg # quality threshold
			filterqual = 'ON'
		elif opt == '-F': # Filter out unmapped reads
			unmapped = 'ON'
		elif opt == '-L': # Filter out reads in specific regions (blacklist)
			checkFile(arg) 
			coordinatefile = arg # file name of blacklist regions
			filtercoord = 'ON'
		elif opt == '--index': # Indexing genome before alignment
			indexGenome = 'ON'
		elif opt == '--rmdup': # Filter out PCR duplicates and non uniquely mappable reads 
			rmvdup = 'ON'
		elif opt == '--sort': # Sorting final bam file
			sorting = 'ON'
		elif opt == '--bamIndex': # Indexing final bam file
			indexBam = 'ON'
		elif opt == '--name': # Prefix to give to output file
			prefix = arg
	return outputdir, selectodir, filterqual, unmapped, filtercoord, indexGenome, rmvdup, sorting, indexBam, coordinatefile, prefix, genome, minqual, fastqfile, fastqfile1, fastqfile2, seq
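Both option parsers depend on small validators that are not shown (checkPath, checkFastq, checkBed, checkGenome, and a single-argument checkFile). A plausible sketch, under the assumption that they only validate existence and extension:

import os
import sys

def checkPath(path):
    # Assumed behavior: report whether the output directory exists.
    return os.path.isdir(path)

def checkFastq(path):
    # Assumed behavior: abort unless the file exists and looks like a FASTQ file.
    if not os.path.isfile(path) or not path.endswith(('.fastq', '.fq', '.fastq.gz', '.fq.gz')):
        sys.exit("Error, '%s' is not an existing FASTQ file" % path)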
Example #4
def get_allFileLabel(allFilePath, dateset_rate):
    # If the labels were computed before and saved as JSON, just load them
    dictFilePath = './mid_data'
    trainSetFileName = 'trainSetLabel.json'
    testSetFileName = 'testSetLabel.json'
    trainSetFileState = checkFile(dictFilePath, trainSetFileName)
    testSetFileState = checkFile(dictFilePath, testSetFileName)
    if trainSetFileState and testSetFileState:
        # Load the training-set labels from JSON into a dict
        with open('./mid_data/trainSetLabel.json', 'r', encoding='UTF-8') as f:
            trainSetLabel = json.load(f)
        # Load the test-set labels from JSON into a dict
        with open('./mid_data/testSetLabel.json', 'r', encoding='UTF-8') as f:
            testSetLabel = json.load(f)
    else:
        # Otherwise split the dataset and build the label dicts
        trainSetFile, testSetFile = divide_dataset(allFilePath, dateset_rate)
        # Training set
        trainSetLabel = {}  # maps file name -> label dict
        for i in trainSetFile:
            j = i.upper()  # uppercase copy, used only for label extraction; the path itself is unchanged
            code = get_mark(j)  # extract the label code as a string
            # Split the three label flags into a nested dict
            trainSetLabel[i] = {
                'corb': int(code[0]),
                'torf': int(code[1]),
                'yorn': int(code[2]),
            }
        # Write the dict to a JSON file once the loop is done
        with open('./mid_data/trainSetLabel.json', 'w') as file:
            json.dump(trainSetLabel, file)
        # Test set
        testSetLabel = {}  # maps file name -> label dict
        for i in testSetFile:
            j = i.upper()  # uppercase copy, used only for label extraction; the path itself is unchanged
            code = get_mark(j)  # extract the label code as a string
            # Split the three label flags into a nested dict
            testSetLabel[i] = {
                'corb': int(code[0]),
                'torf': int(code[1]),
                'yorn': int(code[2]),
            }
        # Write the dict to a JSON file once the loop is done
        with open('./mid_data/testSetLabel.json', 'w') as file:
            json.dump(testSetLabel, file)
    return trainSetLabel, testSetLabel

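get_mark is not shown; from its use above it appears to map an upper-cased file name to a three-character code whose digits become the corb, torf and yorn flags. A minimal sketch, under the (unconfirmed) assumption that the code is a run of three digits embedded in the file name:

import re

def get_mark(name_upper):
    # Assumption: the label is the first run of three digits in the upper-cased file name.
    match = re.search(r'\d{3}', name_upper)
    return match.group(0) if match else '000'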
Example #5
def uploadFile(fileHash, fileContent, username, backend, fileName):
    '''Uploads a file to the dropbox for online.
    '''

    logging.debug('dropBox::uploadFile(%s, %s [len], %s, %s, %s)', fileHash, len(fileContent), username, backend, fileName)

    logging.info('uploadFile(): Checking whether the hash is valid...')
    checkHash(fileHash)

    logging.info('uploadFile(): %s: Checking the file content hash...', fileHash)
    fileContentHash = getHash(fileContent)
    if fileHash != fileContentHash:
        raise DropBoxError('The given file hash %s does not match with the file content hash %s.' % (fileHash, fileContentHash))

    logging.info('uploadFile(): %s: Checking whether the file already exists...', fileHash)
    state = dataAccess.getFileState(fileHash)

    if state == 'Uploaded':
        raise DropBoxError('The uploaded file with hash %s already exists in the Uploaded files (i.e. not yet processed). This probably means that you sent the same request twice in a short time.' % fileHash)

    if state == 'Pending':
        raise DropBoxError('The uploaded file with hash %s already exists in the Pending files (i.e. files that are waiting to be pulled by online that were already checked). This probably means that you sent the same request twice in a short time.' % fileHash)

    if state == 'Acknowledged':
        raise DropBoxError('The uploaded file with hash %s already exists in the Acknowledged files (i.e. files that were already pulled by online not too long ago -- we do not keep all of them forever). This probably means that you sent the same request twice after some time.' % fileHash)

    if state == 'Bad':
        raise DropBoxError('The uploaded file with hash %s already exists in the Bad files (i.e. files that were wrong for some reason). Therefore this file will be skipped since the results of the checks should be the same again (i.e. wrong).' % fileHash)

    logging.info('uploadFile(): %s: Saving the uploaded file in the database...', fileHash)
    dataAccess.insertFile(fileHash, 'Uploaded', backend, username, fileName, fileContent)

    logging.info('uploadFile(): %s: Checking the contents of the file...', fileHash)
    try:
        metadata = check.checkFile(fileHash, fileContent, backend)
    except DropBoxError:
        failUpload(fileHash)
        raise
    except Exception as e:
        # Other kind of exception: this is a bug :(
        alarm.alarm('Non-DropBoxError exception raised in check.py: %s' % e)
        failUpload(fileHash)
        raise DropBoxError('Oops, something went wrong while checking your file. This is most likely a bug in the DropBox. %s' % config.notifiedErrorMessage)

    # Divide the metadata in the userText and the real metadata
    userText = metadata['userText']
    metadata['userText'] = ''

    logging.info('uploadFile(): %s: Inserting entry in the fileLog...', fileHash)
    try:
        dataAccess.insertFileLog(fileHash, Constants.WAITING_FOR_START, dumpJson(metadata), dumpJson(userText))
    except cx_Oracle.IntegrityError:
        failUpload(fileHash)
        raise DropBoxError('The uploaded file %s was already requested in the database.' % fileHash)

    logging.info('uploadFile(): %s: Updating state of the file to Pending...', fileHash)
    dataAccess.updateFileState(fileHash, 'Pending')

    logging.info('uploadFile(): %s: The upload was successful.', fileHash)
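The helper getHash is not included in the snippet; since its result is compared against the client-supplied fileHash, it is presumably a hex digest of the raw content. A minimal sketch (the choice of SHA-1 is an assumption, not confirmed by the code above):

import hashlib

def getHash(data):
    # Assumed behavior: hex digest of the raw file content; the actual algorithm may differ.
    return hashlib.sha1(data).hexdigest()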
Example #6
def get_mfcc_list(allFilePath, n_mfcc, dateset_rate):
    # Use the training split to extract the audio features
    trainSetFile, _ = divide_dataset(allFilePath, dateset_rate)
    print(type(trainSetFile))  # <class 'numpy.ndarray'>
    wavmfcc = {}
    mfcc_list = []  # holds every feature matrix; final shape should be N(=1002) * DCT(=13) * L(= per-file length)
    max_length = 0  # longest vector length seen so far
    # If mfcc_length.csv already exists, delete it so it can be rewritten
    # (the CSV is kept only so a human can inspect the raw feature shapes)
    dataFilePath = './mid_data'
    dataFileName = 'mfcc_length.csv'
    dataFileState = checkFile(dataFilePath, dataFileName)
    if dataFileState:
        os.remove("./mid_data/mfcc_length.csv")
    for i in trainSetFile:
        print(i)
        y, sr = librosa.load(i)
        # mfcc is a 2-D matrix: n_mfcc rows * N columns (N varies with the file length)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        # Store this file's MFCC matrix in the dict
        wavmfcc[i] = mfcc.tolist()
        # Append each file's MFCC matrix to mfcc_list, in order
        mfcc_list.append(wavmfcc[i])
        # Keep track of the longest sequence among the training files
        max_length = max(max_length, len(wavmfcc[i][0]))
        # Append one row per file to mfcc_length.csv so the feature shapes are easy to inspect;
        # the dict keys below become the CSV column names
        dataframe = pandas.DataFrame(
            {
                'filename': i,
                'number': len(wavmfcc[i]),
                'length': len(wavmfcc[i][0])
            },
            index=[0])
        # Append the DataFrame to the CSV (index controls whether row labels are written, default True)
        dataframe.to_csv("./mid_data/mfcc_length.csv",
                         mode='a',
                         header=None,
                         index=False,
                         sep=',')
    return wavmfcc, mfcc_list, max_length
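A quick usage sketch (the directory and parameter values below are placeholders, not values from the original project):

# Hypothetical call: 13 MFCC coefficients, 80% of the files used for training.
wavmfcc, mfcc_list, max_length = get_mfcc_list('./data', n_mfcc=13, dateset_rate=0.8)
# wavmfcc maps each file path to a 13 x L nested list, mfcc_list collects those
# matrices in order, and max_length is the longest L seen in the training set.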
Example #7
def get_opt(filepath, dateset_rate, learnrate, finalvalue):
    # If the intermediate files already exist, load them; otherwise compute and save them
    dataFilePath = './mid_data'
    file_X = 'X.npy'
    file_Y = 'Y.npy'
    file_N = 'N.npy'
    state_X = checkFile(dataFilePath, file_X)
    state_Y = checkFile(dataFilePath, file_Y)
    state_N = checkFile(dataFilePath, file_N)
    if state_X and state_Y and state_N:
        print("exist file:", file_X, file_Y, file_N)
        X = np.load("./mid_data/X.npy")
        Y = np.load("./mid_data/Y.npy")
        N = np.load("./mid_data/N.npy")
    else:
        # Parameters: training data directory, threshold step (learning rate),
        # and the stop value for the scan (Y/N generally stop separating past about 0.2)
        # All .wav files are needed, including those in subdirectories
        # Use the randomly split training set
        # filename = get_filename(filepath)
        trainSet, _ = divide_dataset(filepath, dateset_rate)
        filename = trainSet
        y = []
        n = []
        count = 0
        for i in filename:
            f = wave.open(i, 'rb')
            # Debug: show the actual path being read
            print("===>>>filepath:", i)
            params = f.getparams()
            nchannels, sampwidth, framerate, nframes = params[:4]
            strData = f.readframes(nframes)
            waveData = np.frombuffer(strData, dtype=np.int16)
            waveData = waveData * 1.0 / (max(abs(waveData)))
            # Debug
            print("waveData===>", waveData)
            k = 0.0
            l = []

            # Convert the file name to upper case so it can be matched in get_mark
            turnI = i.upper()
            mark = int(get_mark(turnI)[2])
            print("mark===>>>", mark)
            while k < finalvalue:
                nk = 0
                for j in waveData:  # iterate over every sample in the waveform
                    if abs(j) >= k:  # count samples whose absolute value is at least k
                        nk = nk + 1
                l.append(nk)  # append the count for this threshold
                k = k + learnrate  # learnrate is the threshold step size
            if mark > 0:  # label 1 means "yes"
                y.append(l)
            else:  # label 0 means "no"
                n.append(l)
            count = count + 1
            print(count, i, ',', mark, len(l))
        Y = np.array(y)
        N = np.array(n)
        X = np.arange(0, finalvalue, learnrate)
        np.save('./mid_data/X.npy', X)
        np.save('./mid_data/Y.npy', Y)
        np.save('./mid_data/N.npy', N)
    # Plot the distribution graphs
    plt.figure(1)  # scatter plot of the full Y/N distributions
    for i in range(Y.shape[0]):
        Yy = Y[i, :]
        plt.plot(X, Yy, 'r.')
    for i in range(N.shape[0]):
        Nn = N[i, :]
        plt.plot(X, Nn, 'b.')
    plt.xlabel("k")
    plt.ylabel("N")
    plt.title("YorN distribution")
    plt.show()

    plt.figure(2)  # statistical summary of the Y/N counts
    ymax = np.amax(Y, 0)
    ymin = np.amin(Y, 0)
    ymean = np.mean(Y, 0)
    nmax = np.amax(N, 0)
    nmin = np.amin(N, 0)
    nmean = np.mean(N, 0)
    plt.subplot(211)  # max, min and mean curves
    plt.xlabel("k")
    plt.ylabel("N")
    plt.title("Statistical YorN distribution")
    plt.plot(X, ymax, 'g^', linewidth=0.1, label='Ymax')
    plt.plot(X, ymin, 'gv', linewidth=0.1, label='Ymin')
    plt.plot(X, ymean, 'g', linewidth=1, label='Ymean')
    plt.plot(X, nmax, 'r^', linewidth=0.1, label='Nmax')
    plt.plot(X, nmin, 'rv', linewidth=0.1, label='Nmin')
    plt.plot(X, nmean, 'r', linewidth=1, label='Nmean')
    plt.legend(loc='upper right')
    plt.subplot(212)  # mean curves only
    plt.xlabel("k")
    plt.ylabel("N")
    plt.title("Means of YorN distribution")
    plt.plot(X, ymean, 'g', linewidth=1, label='Ymean')
    plt.plot(X, nmean, 'r', linewidth=1, label='Nmean')
    plt.legend(loc='upper right')
    plt.show()

    delta = nmean - ymean
    print(delta.shape)

    ok = 0.0
    md = 0
    for i in X:
        if i > 0.01:  # restrict the search; separation is meaningless in the sparse high-threshold region
            x = int(i * 1000)  # index into delta (appears to assume learnrate == 0.001)
            if md < delta[x]:  # keep the threshold with the largest Y/N gap
                md = delta[x]
                ok = i  # remember the corresponding threshold k
    print("ok==>", ok, "md==>", md)
    # With 300 files the result was: ok==> 0.02, md==> 9277.826351351352
    # With 1002 files the result was: ok==> 0.018000000000000002, md==> 8600.688151394424
    plt.figure(3)  # difference curve; locate the best local maximum
    plt.xlabel("k")
    plt.ylabel("Difference")
    plt.plot(X, delta, '-', linewidth=1, label='Difference')
    plt.scatter([ok], [md], color='red', linewidth=0.5)
    plt.plot([ok, ok], [0, md],
             color='blue',
             linestyle="--",
             linewidth=0.7)
    plt.legend(loc='upper right')
    plt.annotate('local maximum D=8600.688151394424',  # annotation text hardcodes one run's values
                 xy=(ok, md),
                 xytext=(0.025, 10000),
                 arrowprops=dict(arrowstyle="->",
                                 connectionstyle="arc3,rad=.2"))
    plt.annotate('optimal k=0.018',
                 xy=(ok, 0),
                 xytext=(0.025, 100),
                 arrowprops=dict(arrowstyle="->",
                                 connectionstyle="arc3,rad=.2"))
    plt.show()

    # Y/N mean counts at the optimal threshold k
    ox = int(ok * 1000)
    Yo = int(ymean[ox])
    No = int(nmean[ox])
    print("No==>", No, "Yo==>", Yo)
    # Save the computed statistics unless they already exist
    file_ymax = 'ymax.npy'
    file_ymin = 'ymin.npy'
    file_ymean = 'ymean.npy'
    file_nmax = 'nmax.npy'
    file_nmin = 'nmin.npy'
    file_nmean = 'nmean.npy'
    file_delta = 'delta.npy'
    state_ymax = checkFile(dataFilePath, file_ymax)
    state_ymin = checkFile(dataFilePath, file_ymin)
    state_ymean = checkFile(dataFilePath, file_ymean)
    state_nmax = checkFile(dataFilePath, file_nmax)
    state_nmin = checkFile(dataFilePath, file_nmin)
    state_nmean = checkFile(dataFilePath, file_nmean)
    state_delta = checkFile(dataFilePath, file_delta)
    if state_ymax and state_ymin and state_ymean and state_nmax and state_nmin and state_nmean and state_delta:
        print("file exist:", state_ymax, state_ymin, state_ymean, state_nmax,
              state_nmin, state_nmean, state_delta)
    else:
        np.save('./mid_data/ymax.npy', ymax)
        np.save('./mid_data/ymin.npy', ymin)
        np.save('./mid_data/ymean.npy', ymean)
        np.save('./mid_data/nmax.npy', nmax)
        np.save('./mid_data/nmin.npy', nmin)
        np.save('./mid_data/nmean.npy', nmean)
        np.save('./mid_data/delta.npy', delta)
    return ox, Yo, No
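The scan above re-reads the whole waveform once per threshold. An equivalent vectorized form with NumPy, shown only as a sketch (it computes the same counts as the inner while-loop, but builds a K x M boolean array, so very long recordings may need chunking):

import numpy as np

def count_above_thresholds(waveData, finalvalue, learnrate):
    # For each threshold k in [0, finalvalue) with step learnrate,
    # count how many samples satisfy |sample| >= k.
    ks = np.arange(0, finalvalue, learnrate)
    return (np.abs(waveData)[None, :] >= ks[:, None]).sum(axis=1)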
Example #8
def get_RNN_x(allFilePath, dct, dateset_rate):
    # If the intermediate data files exist, load them directly
    dataFilePath = './mid_data'
    wavmfccFileName = "wavmfcc.json"  # not needed for the computation here; saved only for human inspection
    format_mfcc_listFileName = "format_mfcc_list.npy"
    mfcc_labelsFileName = "mfcc_labels.npy"
    mfcc_length_listFileName = "mfcc_length_list.npy"

    wavmfccFileState = checkFile(dataFilePath, wavmfccFileName)
    formatFileState = checkFile(dataFilePath, format_mfcc_listFileName)
    labelsFileState = checkFile(dataFilePath, mfcc_labelsFileName)
    lengthFileState = checkFile(dataFilePath, mfcc_length_listFileName)
    if wavmfccFileState and formatFileState and labelsFileState and lengthFileState:
        # All files exist: load them (wavmfcc.json is skipped because it is not used in the computation)
        format_mfcc_list = numpy.load("./mid_data/format_mfcc_list.npy")
        mfcc_labels = numpy.load("./mid_data/mfcc_labels.npy")
        mfcc_length_list = numpy.load("./mid_data/mfcc_length_list.npy")
    else:
        # Returns the un-normalized data: one 13 x N matrix per file name (N differs per file)
        wavmfcc, mfcc_list, max_length = get_mfcc_list(allFilePath, dct,
                                                       dateset_rate)
        # batch_seqlen: the true sequence length of each file's feature matrix
        mfcc_length_list = []
        # Iterate over each file's feature matrix
        for vector_i in mfcc_list:
            # vector_i has dct rows; every row has the same number of columns,
            # so the length of the first row is this file's sequence length
            each_length = len(vector_i[0])
            # mfcc_list is 3-D: file index * dct coefficient * frame,
            # so mfcc_length_list ends up with shape FileQuantity * 1
            mfcc_length_list.append(each_length)
            # Pad every matrix to a fixed width
            if each_length < max_length:
                # Build the padding values for the missing length
                replenish = format_Matrix(max_length - each_length)
                # Extend every row of this file's matrix in place
                for k in vector_i:
                    k += replenish
        # Collect the corb label of every file in the training set
        # File list
        trainSet, _ = divide_dataset(allFilePath, dateset_rate)
        fileClass = trainSet
        # Label dict
        trainSetLabel, _ = get_allFileLabel(allFilePath, dateset_rate)
        dictClass = trainSetLabel
        # List of corb labels
        mfcc_labels = get_label_corb(dictClass, fileClass)

        # Save the intermediate data so later runs can load it directly
        # Convert the padded MFCC list to a numpy array;
        # shape: N(=1002) * DCT(=13) * max_length (longest sequence in the training set)
        format_mfcc_list = numpy.array(mfcc_list)
        # mfcc_labels holds the corb value (0/1) of every training file
        mfcc_labels = numpy.array(mfcc_labels)
        # Shape: N (number of training files) * 1, the true length of each sequence
        mfcc_length_list = numpy.array(mfcc_length_list)
        # Save as .npy so later computations can reuse the arrays
        numpy.save('./mid_data/format_mfcc_list.npy', format_mfcc_list)
        numpy.save('./mid_data/mfcc_labels.npy', mfcc_labels)
        numpy.save('./mid_data/mfcc_length_list.npy', mfcc_length_list)
        # Save the dict as a JSON file
        jsonFile = json.dumps(wavmfcc)
        with open('./mid_data/wavmfcc.json', 'w') as file:
            file.write(jsonFile)
        # Print the number of entries (key/value pairs) in the dict
        print("wavmfcc")
        print(len(wavmfcc))
    return format_mfcc_list, mfcc_labels, mfcc_length_list
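format_Matrix is not shown; from the padding loop above it apparently returns a 1-D list that extends each MFCC row to max_length. A minimal sketch, under the assumption that the padding value is zero:

def format_Matrix(n):
    # Assumed behavior: a list of n zero values used to pad each MFCC row.
    return [0.0] * n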