예제 #1
0
def getEstimatedCoverage(fastq_files, estimatedGenomeSizeMb, outdir, threads, estimatedMinimumCoverage):
    """Estimate the depth of coverage of a sample and compare it to a minimum.

    One ``countSequencedBases`` task is submitted per fastq file. The base
    counts are then read from the ``estimate_coverage.*.pkl`` files found in
    ``outdir`` (presumably written by those tasks -- confirm against
    ``countSequencedBases``); every such pickle is deleted after the scan.

    Parameters:
        fastq_files: iterable of fastq file paths.
        estimatedGenomeSizeMb: expected genome size, in megabases.
        outdir: directory holding the pickles and receiving the report.
        threads: number of worker processes for the pool.
        estimatedMinimumCoverage: minimum coverage required to pass QC.

    Returns:
        Tuple ``(run_successfully, pass_qc, failing, estimatedCoverage)``.
        ``failing['sample']`` is ``False`` when QC passes, otherwise a message.
        ``estimatedCoverage`` is ``None`` when the estimation did not run.
    """
    run_successfully = False
    pass_qc = False
    failing = {'sample': False}
    estimatedCoverage = None

    # Get number of bases for each fastq file (results come back via pickles)
    pool = multiprocessing.Pool(processes=threads)
    for fastq in fastq_files:
        pool.apply_async(countSequencedBases, args=(fastq, outdir,))
    pool.close()
    pool.join()

    numberBases = 0
    file_problems = False
    files = [f for f in os.listdir(outdir) if not f.startswith('.') and os.path.isfile(os.path.join(outdir, f))]
    for file_found in files:
        if not (file_found.startswith('estimate_coverage.') and file_found.endswith('.pkl')):
            continue
        file_path = os.path.join(outdir, file_found)

        # After the first failed file nothing else is read, but the
        # remaining pickles are still cleaned up.
        if not file_problems:
            run_successfully, bases = utils.extractVariableFromPickle(file_path)
            if run_successfully:
                numberBases += bases
            else:
                file_problems = True

        os.remove(file_path)

    if not run_successfully:
        failing['sample'] = 'Did not run'
        print(failing['sample'])
        return run_successfully, pass_qc, failing, estimatedCoverage

    estimatedCoverage = numberBases / float(estimatedGenomeSizeMb * 1000000)
    estimatedCoverage = round(estimatedCoverage, 1)

    # Append mode creates the file when it is missing, so a single open()
    # covers both cases; the context manager closes it even on error.
    report_file = os.path.join(outdir, 'coverage_report.txt')
    with open(report_file, 'at') as writer:
        writer.write(str(estimatedCoverage) + 'x' + '\n')

    report = 'Estimated depth coverage: ' + str(estimatedCoverage) + 'x'
    if estimatedCoverage >= estimatedMinimumCoverage:
        pass_qc = True
        print(report)
    else:
        failing['sample'] = report + ' (lower than ' + str(estimatedMinimumCoverage) + 'x)'
        print(failing['sample'])

    return run_successfully, pass_qc, failing, estimatedCoverage
예제 #2
0
def sample_coverage(referenceFile, alignment_file, outdir, threads):
    """Gather per-sequence coverage for an alignment.

    A fresh ``samtools_depth`` directory is created under ``outdir``, one
    ``get_sequence_coverage`` task is submitted per reference sequence, and
    the ``coverage.sequence_*.pkl`` files found in that directory are read
    and deleted.

    Returns:
        Tuple ``(sample_coverage_no_problems, mean_coverage_data)``. The dict
        maps each sequence to its ``position``, ``coverage`` and
        ``mean_coverage`` (coverage / position, rounded to 2 decimals).
    """
    coverage_outdir = os.path.join(outdir, 'samtools_depth', '')
    utils.removeDirectory(coverage_outdir)
    os.makedirs(coverage_outdir)

    headers = sequenceHeaders(referenceFile)

    workers = multiprocessing.Pool(processes=threads)
    for index, header in enumerate(headers):
        workers.apply_async(get_sequence_coverage,
                            args=(alignment_file, header, coverage_outdir, index,))
    workers.close()
    workers.join()

    all_fine = True
    per_sequence = {}
    for entry in os.listdir(coverage_outdir):
        if not entry.startswith('coverage.sequence_'):
            continue
        if not entry.endswith('.pkl'):
            continue
        pickle_path = os.path.join(coverage_outdir, entry)
        if not os.path.isfile(pickle_path):
            continue

        # Stop reading after the first problematic sequence, but keep
        # deleting the remaining pickles.
        if all_fine:
            name, ran_ok, had_problems, position, coverage = \
                utils.extractVariableFromPickle(pickle_path)
            if ran_ok and not had_problems:
                mean_value = round((float(coverage) / float(position)), 2)
                per_sequence[name] = {
                    'position': position,
                    'coverage': coverage,
                    'mean_coverage': mean_value,
                }
            else:
                print('WARNING: it was not possible to compute coverage information for sequence ' + name)
                all_fine = False

        os.remove(pickle_path)

    return all_fine, per_sequence
예제 #3
0
def runFastQintegrity(fastq_files, threads, outdir):
    """Check fastq files for corruption and determine their quality encoding.

    A fresh ``fastQintegrity`` folder is created under ``outdir``, one
    ``fastQintegrity`` task is submitted per fastq file, and the ``.pkl``
    files found in that folder are read. The folder is removed at the end.

    Parameters:
        fastq_files: iterable of fastq file paths.
        threads: number of worker processes for the pool.
        outdir: directory under which the working folder is created.

    Returns:
        Tuple ``(not_corruption_found, pass_qc, failing, encoding,
        min_reads_length, max_reads_length)``. ``failing`` is
        ``{'sample': False}`` when nothing failed, otherwise it maps each
        failing pickle base name to a one-message list. ``encoding`` is the
        single encoding shared by all files, or ``None`` when it could not be
        determined.
    """
    pass_qc = True
    failing = {'sample': False}
    not_corruption_found = True

    fastQintegrity_folder = os.path.join(outdir, 'fastQintegrity', '')
    utils.removeDirectory(fastQintegrity_folder)
    os.mkdir(fastQintegrity_folder)

    pool = multiprocessing.Pool(processes=threads)
    for fastq in fastq_files:
        pool.apply_async(fastQintegrity, args=(fastq, fastQintegrity_folder,))
    pool.close()
    pool.join()

    encoding = {}
    files = [f for f in os.listdir(fastQintegrity_folder) if not f.startswith('.') and os.path.isfile(os.path.join(fastQintegrity_folder, f))]
    for file_found in files:
        file_path = os.path.join(fastQintegrity_folder, file_found)
        if file_found.endswith('.pkl'):
            file_run_successfully, file_encoding, file_min_length, file_max_length = utils.extractVariableFromPickle(file_path)
            if file_run_successfully:
                encoding[file_found] = {'file_encoding': file_encoding, 'min_reads_length': file_min_length, 'max_reads_length': file_max_length}
            else:
                failed_name = os.path.splitext(file_found)[0]
                failing[failed_name] = ['The file is possibly corrupt']
                print(failed_name + ': the file is possibly corrupt')
        # Every visible file is removed, not only the pickles
        os.remove(file_path)

    # More than the initial 'sample' key means at least one file failed
    if len(failing) > 1:
        failing.pop('sample')
        not_corruption_found = False
        pass_qc = False

    min_reads_length, max_reads_length = None, None

    if not encoding:
        encoding = None
        print('It was no possible to determine the FASTQ encodings')
    else:
        min_reads_length, max_reads_length = guess_encoding.determine_min_max_reads_length(encoding)

        # First candidate encoding of every file that reported one
        found_encodings = [x['file_encoding'][0] for x in encoding.values() if x['file_encoding'] is not None]
        if len(set(found_encodings)) == 1:
            encoding = found_encodings[0]
            print('Fastq quality encoding: ' + str(encoding))
        else:
            print('It was no possible to determine the FASTQ encodings')
            print('This was what has been found: ' + str(encoding))
            encoding = None

    utils.removeDirectory(fastQintegrity_folder)

    return not_corruption_found, pass_qc, failing, encoding, min_reads_length, max_reads_length
예제 #4
0
def gather_data_together(data_directory):
    """Collect the per-fastq encoding results stored in a directory.

    Every regular file named ``encoding.*.pkl`` in ``data_directory`` is
    unpickled, recorded under its fastq key and then deleted.

    Returns:
        dict mapping each fastq to a dict with ``valid_encodings``,
        ``min_reads_length`` and ``max_reads_length``.
    """
    gathered = {}

    for entry in os.listdir(data_directory):
        if not entry.startswith('encoding.'):
            continue
        if not entry.endswith('.pkl'):
            continue
        pickle_path = os.path.join(data_directory, entry)
        if not os.path.isfile(pickle_path):
            continue

        fastq, encodings, shortest, longest = utils.extractVariableFromPickle(pickle_path)
        gathered[fastq] = {
            'valid_encodings': encodings,
            'min_reads_length': shortest,
            'max_reads_length': longest,
        }

        os.remove(pickle_path)

    return gathered
예제 #5
0
def getPickleRunSuccessfully(directory, pickle_prefix):
    """Tell whether every status pickle found reports a successful run.

    The files returned by ``findFiles(directory, pickle_prefix, '.pkl')`` are
    read until one holds a false value; all of them are deleted. If no
    pickle could be read the result is ``False``.
    """
    pickles = findFiles(directory, pickle_prefix, '.pkl')

    outcome = True
    any_read = False
    if pickles is not None:
        for pickle_path in pickles:
            # Once a failure is seen the remaining files are only removed
            if outcome:
                outcome = utils.extractVariableFromPickle(pickle_path)
                any_read = True

            os.remove(pickle_path)

    return outcome if any_read else False
예제 #6
0
파일: pear.py 프로젝트: yemilawal/INNUca
def get_compressed_decompressed_reads(outdir):
    """Collect the decompressed read files reported by the pickles in outdir.

    Every visible regular ``.pkl`` file in ``outdir`` is deleted; pickles are
    read only while no failure has been seen.

    Returns:
        Tuple ``(run_successfully, decompressed_files)`` where the list holds
        the decompressed file of each successfully read pickle.
    """
    run_successfully = True
    decompressed_files = []

    pickles = [
        name for name in os.listdir(outdir)
        if name.endswith('.pkl') and not name.startswith('.')
        and os.path.isfile(os.path.join(outdir, name))
    ]
    for name in pickles:
        pickle_path = os.path.join(outdir, name)

        if run_successfully:
            # The sequence length stored in the pickle is not part of the result
            run_successfully, decompressed_file, _length_sequence = utils.extractVariableFromPickle(pickle_path)
            if run_successfully:
                decompressed_files.append(decompressed_file)

        os.remove(pickle_path)

    return run_successfully, decompressed_files
예제 #7
0
def runFastQintegrity(fastq_files, threads, outdir):
    """Check fastq files for corruption.

    A fresh ``fastQintegrity`` folder is created under ``outdir``, one
    ``fastQintegrity`` task is submitted per fastq file, and the success flag
    stored in each ``.pkl`` file found in that folder is read. The folder is
    removed at the end.

    Parameters:
        fastq_files: iterable of fastq file paths.
        threads: number of worker processes for the pool.
        outdir: directory under which the working folder is created.

    Returns:
        Tuple ``(not_corruption_found, None, failing)``. ``failing`` is
        ``{'sample': False}`` when nothing failed, otherwise it maps each
        failing pickle base name to a one-message list.
    """
    failing = {'sample': False}
    not_corruption_found = True

    fastQintegrity_folder = os.path.join(outdir, 'fastQintegrity', '')
    utils.removeDirectory(fastQintegrity_folder)
    os.mkdir(fastQintegrity_folder)

    pool = multiprocessing.Pool(processes=threads)
    for fastq in fastq_files:
        pool.apply_async(fastQintegrity, args=(fastq, fastQintegrity_folder,))
    pool.close()
    pool.join()

    files = [
        f for f in os.listdir(fastQintegrity_folder) if not f.startswith('.')
        and os.path.isfile(os.path.join(fastQintegrity_folder, f))
    ]
    for file_found in files:
        file_path = os.path.join(fastQintegrity_folder, file_found)
        if file_found.endswith('.pkl'):
            file_run_successfully = utils.extractVariableFromPickle(file_path)
            if not file_run_successfully:
                failed_name = os.path.splitext(file_found)[0]
                failing[failed_name] = ['The file is possibly corrupt']
                print(failed_name + ': the file is possibly corrupt')
        # Every visible file is removed, not only the pickles
        os.remove(file_path)

    # More than the initial 'sample' key means at least one file failed
    if len(failing) > 1:
        failing.pop('sample')
        not_corruption_found = False

    utils.removeDirectory(fastQintegrity_folder)

    return not_corruption_found, None, failing  # None added for consistency with other steps
예제 #8
0
def sample_coverage(referenceFile, alignment_file, outdir, threads):
    """Compute the mean coverage of every reference sequence.

    Recreates ``<outdir>/samtools_depth/``, submits one
    ``get_sequence_coverage`` task per sequence header, then reads and
    deletes the ``coverage.sequence_*.pkl`` files found there.

    Returns:
        Tuple ``(sample_coverage_no_problems, mean_coverage_data)``; each
        entry of the dict holds ``position``, ``coverage`` and
        ``mean_coverage`` (coverage / position, rounded to 2 decimals).
    """
    coverage_outdir = os.path.join(outdir, 'samtools_depth', '')
    utils.removeDirectory(coverage_outdir)
    os.makedirs(coverage_outdir)

    sequences = sequenceHeaders(referenceFile)

    pool = multiprocessing.Pool(processes=threads)
    for task_number, sequence in enumerate(sequences):
        task_args = (alignment_file, sequence, coverage_outdir, task_number,)
        pool.apply_async(get_sequence_coverage, args=task_args)
    pool.close()
    pool.join()

    no_problems = True
    coverage_by_sequence = {}
    result_pickles = [
        f for f in os.listdir(coverage_outdir)
        if f.startswith('coverage.sequence_') and f.endswith('.pkl')
        and os.path.isfile(os.path.join(coverage_outdir, f))
    ]
    for pickle_name in result_pickles:
        pickle_path = os.path.join(coverage_outdir, pickle_name)

        # Reading stops at the first failure; deletion does not
        if no_problems:
            unpickled = utils.extractVariableFromPickle(pickle_path)
            sequence_name, ok, problems, position, coverage = unpickled
            if not ok or problems:
                print('WARNING: it was not possible to compute coverage information for sequence ' + sequence_name)
                no_problems = False
            else:
                coverage_by_sequence[sequence_name] = {
                    'position': position,
                    'coverage': coverage,
                    'mean_coverage': round((float(coverage) / float(position)), 2),
                }

        os.remove(pickle_path)

    return no_problems, coverage_by_sequence
예제 #9
0
def gather_data_together(sample, data_directory, sequences_information, outdir,
                         debug_mode_true):
    """Merge the per-gene coverage pickles of a sample.

    Each visible sub-directory of ``data_directory`` is scanned for
    ``coverage_info.*.pkl`` files. While no failure has been seen, every
    pickle is read, its consensus sequence is passed to ``write_consensus``
    and its statistics are stored under the sequence counter. Consensus
    fasta files already present in ``outdir`` are deleted before the first
    write. Gene directories are removed unless ``debug_mode_true`` is set.

    Returns:
        Tuple ``(run_successfully, sample_data, consensus_files)``.
        ``run_successfully`` is forced to ``False`` when the number of
        pickles read differs from ``len(sequences_information)``.
    """
    run_successfully = True
    genes_read = 0
    sample_data = {}
    consensus_files = None
    old_consensus_pending = True

    for entry in os.listdir(data_directory):
        if entry.startswith('.'):
            continue
        gene_dir_path = os.path.join(data_directory, entry, '')
        if not os.path.isdir(gene_dir_path):
            continue

        for name in os.listdir(gene_dir_path):
            if not (name.startswith('coverage_info.') and name.endswith('.pkl')):
                continue
            pickle_path = os.path.join(gene_dir_path, name)
            if not os.path.isfile(pickle_path):
                continue
            if not run_successfully:
                continue

            (run_successfully, sequence_counter, multiple_alleles_found,
             percentage_absent, percentage_lowCoverage, meanCoverage,
             consensus_sequence, number_diferences) = utils.extractVariableFromPickle(pickle_path)

            # Drop consensus files left from a previous run, once
            if old_consensus_pending:
                for consensus_type in ('correct', 'noMatter', 'alignment'):
                    stale_file = os.path.join(outdir, str(sample + '.' + consensus_type + '.fasta'))
                    if os.path.isfile(stale_file):
                        os.remove(stale_file)
                old_consensus_pending = False
            consensus_files = write_consensus(outdir, sample, consensus_sequence)

            gene_info = sequences_information[sequence_counter]
            sample_data[sequence_counter] = {
                'header': gene_info['header'],
                'gene_coverage': 100 - percentage_absent,
                'gene_low_coverage': percentage_lowCoverage,
                'gene_number_positions_multiple_alleles': multiple_alleles_found,
                'gene_mean_read_coverage': meanCoverage,
                'gene_identity': 100 - (float(number_diferences) / gene_info['length']),
            }
            genes_read += 1

        if not debug_mode_true:
            utils.removeDirectory(gene_dir_path)

    if genes_read != len(sequences_information):
        run_successfully = False

    return run_successfully, sample_data, consensus_files
예제 #10
0
def runFastQintegrity(fastq_files, threads, outdir):
    """Check fastq files for corruption and determine their quality encoding.

    A fresh ``fastQintegrity`` folder is created under ``outdir``, one
    ``fastQintegrity`` task is submitted per fastq file, and the ``.pkl``
    files found in that folder are read. The folder is removed at the end.

    Parameters:
        fastq_files: iterable of fastq file paths.
        threads: number of worker processes for the pool.
        outdir: directory under which the working folder is created.

    Returns:
        Tuple ``(not_corruption_found, pass_qc, failing, encoding,
        min_reads_length, max_reads_length)``. ``failing`` is
        ``{'sample': False}`` when nothing failed, otherwise it maps each
        failing pickle base name to a one-message list. ``encoding`` is the
        single encoding shared by all files, or ``None`` when it could not be
        determined.
    """
    pass_qc = True
    failing = {'sample': False}
    not_corruption_found = True

    fastQintegrity_folder = os.path.join(outdir, 'fastQintegrity', '')
    utils.removeDirectory(fastQintegrity_folder)
    os.mkdir(fastQintegrity_folder)

    pool = multiprocessing.Pool(processes=threads)
    for fastq in fastq_files:
        pool.apply_async(fastQintegrity, args=(fastq, fastQintegrity_folder,))
    pool.close()
    pool.join()

    encoding = {}
    files = [f for f in os.listdir(fastQintegrity_folder) if not f.startswith('.') and os.path.isfile(os.path.join(fastQintegrity_folder, f))]
    for file_found in files:
        file_path = os.path.join(fastQintegrity_folder, file_found)
        if file_found.endswith('.pkl'):
            file_run_successfully, file_encoding, file_min_length, file_max_length = utils.extractVariableFromPickle(file_path)
            if file_run_successfully:
                encoding[file_found] = {'file_encoding': file_encoding, 'min_reads_length': file_min_length, 'max_reads_length': file_max_length}
            else:
                failed_name = os.path.splitext(file_found)[0]
                failing[failed_name] = ['The file is possibly corrupt']
                print(failed_name + ': the file is possibly corrupt')
        # Every visible file is removed, not only the pickles
        os.remove(file_path)

    # More than the initial 'sample' key means at least one file failed
    if len(failing) > 1:
        failing.pop('sample')
        not_corruption_found = False
        pass_qc = False

    min_reads_length, max_reads_length = None, None

    if not encoding:
        encoding = None
        print('It was no possible to determine the FASTQ encodings')
    else:
        min_reads_length, max_reads_length = guess_encoding.determine_min_max_reads_length(encoding)

        # First candidate encoding of every file that reported one
        found_encodings = [x['file_encoding'][0] for x in encoding.values() if x['file_encoding'] is not None]
        if len(set(found_encodings)) == 1:
            encoding = found_encodings[0]
            print('Fastq quality encoding: ' + str(encoding))
        else:
            print('It was no possible to determine the FASTQ encodings')
            print('This was what has been found: ' + str(encoding))
            encoding = None

    utils.removeDirectory(fastQintegrity_folder)

    return not_corruption_found, pass_qc, failing, encoding, min_reads_length, max_reads_length
예제 #11
0
def runCampyGenomes(args):
    """Run ``downloadAndINNUca`` over a list of Run IDs with several thread settings.

    The Run IDs are shuffled, split into batches per thread setting by
    ``determineBatchSamples`` and processed through a multiprocessing pool.
    For every sample directory the ``<sample>_run_successfully.pkl`` and
    ``<sample>_downloadAndINNUca_time.pkl`` files are read and deleted;
    failures and running times are written to two tab-separated files in the
    output directory.

    Parameters:
        args: parsed command-line arguments; the attributes read here are
            ``listRunIDs``, ``outdir``, ``asperaKey`` and ``threads``.

    Exits the program through ``sys.exit`` when no sample ran successfully.
    """
    start_time = time.time()

    listRunIDs = utils.getListIDs(os.path.abspath(args.listRunIDs.name))
    outdir = os.path.abspath(args.outdir)
    utils.check_create_directory(outdir)
    asperaKey = args.asperaKey.name
    threads_to_use = [j for j in general_threads_to_use if j <= args.threads]

    # Start logger
    logfile, time_str = utils.start_logger(outdir)

    # Get general information
    utils.general_information(logfile, version, outdir, time_str)

    # Check programs
    requiredPrograms()

    # Randomize the list with Run IDs
    random.shuffle(listRunIDs)

    number_process = determineNumberProcess(threads_to_use)

    samples_each_threads = determineBatchSamples(listRunIDs, threads_to_use)

    run_successfully = 0
    problems_file = os.path.join(outdir, 'samples_with_problems.' + time_str + '.tab')
    times_file = os.path.join(outdir, 'running_times.' + time_str + '.tab')
    with open(problems_file, 'wt') as writer_success:
        with open(times_file, 'wt') as writer_times:

            for threads in samples_each_threads:
                print('\n' + 'Running for ' + str(threads) + ' threads' + '\n')
                threads_dir = os.path.join(outdir, str(threads) + '_threads', '')
                utils.check_create_directory(threads_dir)

                pool = multiprocessing.Pool(processes=number_process[threads])
                for sample in samples_each_threads[threads]:
                    pool.apply_async(downloadAndINNUca,
                                     args=(threads_dir, sample, asperaKey, threads,))
                pool.close()
                pool.join()

                removeFiles(threads_dir, '.log')
                removeFiles(threads_dir, 'getSeqENA.samples_with_problems.txt')
                removeFiles(threads_dir, '.cpu.txt')

                samples_directories = [
                    d for d in os.listdir(threads_dir) if not d.startswith('.')
                    and os.path.isdir(os.path.join(threads_dir, d, ''))
                ]
                for sample_dir in samples_directories:
                    sample_dir_path = os.path.join(threads_dir, sample_dir, '')

                    files = [
                        f for f in os.listdir(sample_dir_path)
                        if not f.startswith('.')
                        and os.path.isfile(os.path.join(sample_dir_path, f))
                    ]
                    for file_found in files:
                        file_path = os.path.join(sample_dir_path, file_found)
                        if file_found == sample_dir + '_run_successfully.pkl':
                            sample_run_successfully = utils.extractVariableFromPickle(file_path)
                            if not sample_run_successfully:
                                writer_success.write(sample_dir + '\t' + threads_dir + '\n')
                            else:
                                run_successfully += 1
                            os.remove(file_path)
                        elif file_found == sample_dir + '_downloadAndINNUca_time.pkl':
                            time_taken = utils.extractVariableFromPickle(file_path)
                            writer_times.write(sample_dir + '\t' + threads_dir + '\t' + str(time_taken) + '\n')
                            os.remove(file_path)

    # The returned value is not needed here; the call is kept because
    # runTime presumably reports the elapsed time itself (confirm in utils).
    utils.runTime(start_time)

    if run_successfully == 0:
        sys.exit('No RunIDs were successfully run!')
    else:
        print(str(run_successfully) + ' samples out of ' + str(len(listRunIDs)) + ' run successfully')
예제 #12
0
def gather_gene_data_together(data_directory, sequences_information):
    """Merge the per-gene coverage pickles found under a data directory.

    Each visible sub-directory of ``data_directory`` is scanned for
    ``coverage_info.*.pkl`` files, which are read while no failure has been
    seen; the sub-directory is then removed.

    Returns:
        Tuple ``(run_successfully, sample_data)``. ``run_successfully`` is
        forced to ``False`` when the number of pickles read differs from
        ``len(sequences_information)``.
    """
    run_successfully = True
    genes_read = 0
    sample_data = {}

    for entry in os.listdir(data_directory):
        if entry.startswith('.'):
            continue
        gene_dir_path = os.path.join(data_directory, entry, '')
        if not os.path.isdir(gene_dir_path):
            continue

        for name in os.listdir(gene_dir_path):
            if not (name.startswith('coverage_info.') and name.endswith('.pkl')):
                continue
            pickle_path = os.path.join(gene_dir_path, name)
            if not os.path.isfile(pickle_path):
                continue
            if not run_successfully:
                continue

            (run_successfully, sequence_counter, multiple_alleles_found,
             percentage_absent, percentage_lowCoverage,
             meanCoverage) = utils.extractVariableFromPickle(pickle_path)
            sample_data[sequence_counter] = {
                'header': sequences_information[sequence_counter]['header'],
                'gene_coverage': 100 - percentage_absent,
                'gene_low_coverage': percentage_lowCoverage,
                'gene_number_positions_multiple_alleles': multiple_alleles_found,
                'gene_mean_read_coverage': meanCoverage,
            }
            genes_read += 1

        utils.removeDirectory(gene_dir_path)

    if genes_read != len(sequences_information):
        run_successfully = False

    return run_successfully, sample_data