예제 #1
0
def parse_fastqc(fqc):
    """Return the 'Basic Statistics' module of a FastQC report as a dict.

    Args:
        fqc: Path to a FastQC data file that ``Fadapa`` can load.

    Returns:
        dict mapping the first column of every row returned by
        ``clean_data('Basic Statistics')`` to that row's second column.

    Raises:
        AssertionError: if ``fqc`` is not an existing file.
    """
    # Raised explicitly instead of via ``assert`` so the check survives
    # ``python -O``; exception type and message are the same as before.
    if not op.isfile(fqc):
        raise AssertionError("%s not exist" % fqc)
    qc = Fadapa(fqc)
    return {row[0]: row[1] for row in qc.clean_data('Basic Statistics')}
예제 #2
0
def parse_fastqc(fqc):
    """Load a FastQC report and expose its 'Basic Statistics' rows as a dict.

    Args:
        fqc: Path to the FastQC data file handed to ``Fadapa``.

    Returns:
        dict whose keys are the first column and whose values are the second
        column of each row from ``clean_data('Basic Statistics')``.

    Raises:
        AssertionError: if ``fqc`` does not point to an existing file.
    """
    if not op.isfile(fqc):
        # ``assert`` statements are removed under ``python -O``; raising the
        # same exception by hand keeps the validation active in that mode.
        raise AssertionError("%s not exist" % fqc)
    basic_rows = Fadapa(fqc).clean_data('Basic Statistics')
    return {measure_row[0]: measure_row[1] for measure_row in basic_rows}
예제 #3
0
    def format_fastqc(self, rawDataPath, currSample):
        """
        Render the FastQC HTML template with values read from one FastQC report.

        :param rawDataPath: path given to Fadapa to load the FastQC data
        :param currSample: not used by this method; kept for caller compatibility
        :return: the filled-in template as one string
        """

        ## FastQC templates
        fastqc_template = os.path.join(self.TEMPLATES_BASE, 'fastqc.html')

        ## Parse the whole report up front; individual values are picked out below
        fqc_object = Fadapa(rawDataPath)

        ## Module status data
        ## Summary rows 1-11 are read by position (first column of each row), so
        ## the placeholder names assume a fixed module order -- TODO confirm
        module_summary = fqc_object.summary()
        module_placeholders = (
            'MODULE_STATS', 'MODULE_PBSQ', 'MODULE_PTSQ', 'MODULE_PSQS',
            'MODULE_PBSC', 'MODULE_PSGCC', 'MODULE_PBNC', 'MODULE_SEQLENDIST',
            'MODULE_SEQDUP', 'MODULE_OVERREP', 'MODULE_ADAPTER')
        template_values = {}
        for row, placeholder in enumerate(module_placeholders, 1):
            template_values[placeholder] = module_summary[row][0]

        ## Basic statistics data (rows 1-7, second column of each row)
        basic_stats = fqc_object.clean_data('Basic Statistics')
        stat_placeholders = (
            'FQC_FILENAME', 'FQC_FILETYPE', 'FQC_ENCODING', 'FQC_TOTALSEQ',
            'FQC_POORQUAL', 'FQC_SEQLEN', 'FQC_GCPCNT')
        for row, placeholder in enumerate(stat_placeholders, 1):
            template_values[placeholder] = basic_stats[row][1]

        ## FastQC html template file with data inserted
        ## 'with' closes the template even if a line fails to format, and join
        ## avoids re-copying the growing result string for every line
        with open(fastqc_template, 'r') as template:
            fqc_return = ''.join(
                line.format(**template_values) for line in template)

        ## return formatted FastQC report
        return fqc_return
예제 #4
0
    def format_fastqc_graphs(self, rawDataPath, currSample):
        """
        Build chart data for three modules of one FastQC report.

        :param rawDataPath: path given to Fadapa to load the FastQC data
        :param currSample: sample label inserted into two of the chart titles
        :return: dict of str -> str; list-valued entries are stored as str(list)
        """

        def to_int(raw_value):
            ## 'NaN' entries become 0; everything else goes str -> float -> int
            return 0 if raw_value == 'NaN' else int(float(raw_value))

        ## Object to get data from FastQC output
        fqc_object = Fadapa(rawDataPath)

        ## Target output dictionary
        fastqc_graphdata = {}

        ## Unextracted data (row 0 of each table is skipped below as a header)
        fqc_pbsq_data = fqc_object.clean_data('Per base sequence quality')
        fqc_pbnc_data = fqc_object.clean_data('Per base N content')
        fqc_seqlen_data = fqc_object.clean_data('Sequence Length Distribution')

        ##
        ## Per Base Pair Sequence Quality
        ## min = item[5], q1 = item[3], median = item[2], q3 = item[4], max = item[6]
        pbsq_labels = []
        pbsq_values = []
        pbsq_means = []
        for item in fqc_pbsq_data[1:]:
            pbsq_labels.append(item[0])  ## label for bin
            ## the mean column gets the same NaN handling as the box values;
            ## previously a 'NaN' mean raised ValueError
            pbsq_means.append(to_int(item[1]))
            pbsq_values.append([to_int(item[col]) for col in (5, 3, 2, 4, 6)])
        fastqc_graphdata['PBSQ_TITLE'] = 'FastQC Per base sequence quality'
        fastqc_graphdata['PBSQ_LABELS'] = str(pbsq_labels)
        fastqc_graphdata['PBSQ_VALUES'] = str(pbsq_values)
        fastqc_graphdata['PBSQ_MEANVAL'] = str(pbsq_means)
        fastqc_graphdata['PBSQ_DESCR'] = 'Per base sequence quality'
        fastqc_graphdata['PBSQ_X'] = 'Position in read (BP)'
        fastqc_graphdata['PBSQ_Y'] = 'PHRED quality score'

        ##
        ## Per Base Pair N Content (values are kept as the strings Fadapa returns)
        fastqc_graphdata['PBNC_TITLE'] = 'FastQC Per base N content for {}'.format(currSample)
        pbnc_labels = [item[0] for item in fqc_pbnc_data[1:]]
        pbnc_values = [item[1] for item in fqc_pbnc_data[1:]]
        fastqc_graphdata['PBNC_LABELS'] = str(pbnc_labels)
        fastqc_graphdata['PBNC_VALUES'] = str(pbnc_values)
        fastqc_graphdata['PBNC_DESCR'] = 'N content per base'
        fastqc_graphdata['PBNC_X'] = 'Position in read (BP)'
        fastqc_graphdata['PBNC_Y'] = 'Percentage content (%)'

        ##
        ## Sequence Length Distribution (values are kept as strings as well)
        fastqc_graphdata['SQLD_TITLE'] = 'FastQC Sequence length distribution for {}'.format(currSample)
        dist_labels = [item[0] for item in fqc_seqlen_data[1:]]
        dist_values = [item[1] for item in fqc_seqlen_data[1:]]
        fastqc_graphdata['SQLD_LABELS'] = str(dist_labels)
        fastqc_graphdata['SQLD_VALUES'] = str(dist_values)
        fastqc_graphdata['SQLD_DESCR'] = 'Sequence length population'
        fastqc_graphdata['SQLD_X'] = 'Sequence length (BP)'
        fastqc_graphdata['SQLD_Y'] = 'Population (#)'

        return fastqc_graphdata
예제 #5
0
    def save_sections_into_file(self):
        """Write selected FastQC modules as tab-separated files into ``self._dir``.

        Does nothing when ``fastqc_data.txt`` is absent from ``self._dir`` or
        when the module-level ``Fadapa`` name is falsy.
        """
        report_path = os.path.join(self._dir, "fastqc_data.txt")
        if not (os.path.exists(report_path) and Fadapa):
            return

        parser = Fadapa(report_path)
        # Second column of summary entries 2..8 names the sections to export.
        sections = [entry[1] for entry in parser.summary()][2:9]
        for section in sections:
            tsv_name = section.replace(" ", "_") + ".tsv"
            table = self._get_module(parser, section)
            table.to_csv(os.path.join(self._dir, tsv_name),
                         sep="\t", index=False)
예제 #6
0
	def FastQC(self):

		"""
		Run FastQC on the forward FASTQ of the current sequence pair.
		Read the resulting fastqc_data.txt with Fadapa.
		Store read count, %GC, per-base quality table and report path on the
		sequence pair object, using the setters that match self.stage.
		:return: None
		"""

		## Target SeqQC/fastqc-stage outdir
		io_trunk = self.sequencepair_object.get_qcpath()
		target_output = os.path.join(io_trunk, self.stage)

		## Run process on specific data (init/trimmed/etc)
		fqfile = self.sequencepair_object.get_forwardfastq()
		force_mkdir(target_output)
		fastqc_process = subprocess.Popen(
			['fastqc', '--quiet', '--extract', '-t', THREADS, '-o', target_output, fqfile],
			stdout=subprocess.PIPE, stderr=subprocess.PIPE)
		## communicate() drains both pipes while waiting for exit; wait() alone
		## can block forever once the child fills a pipe buffer
		fastqc_process.communicate()

		## Remove ZIP of results
		for candidate in glob.glob(os.path.join(target_output, '*.zip')):
			os.remove(candidate)

		## Get path for fastqc_data.txt for current execution
		## (if several files match, the last one visited by os.walk is used;
		## if none match, target_file stays '' and Fadapa is given that)
		target_file = ''
		for root, dirs, files in os.walk(target_output):
			for name in files:
				if name.endswith('fastqc_data.txt'):
					target_file = os.path.join(root, name)

		## Number of reads present; for end-report i/o
		## Append path to FQC report so we can scrape at will
		f = Fadapa(target_file)
		stats = f.clean_data('Basic Statistics')
		pbsq = f.clean_data('Per base sequence quality')
		read_count = [x for x in stats if 'Total Sequences' in x][0][1]
		gc_pcnt = [x for x in stats if '%GC' in x][0][1]

		## Any other stage value stores nothing
		if self.stage == 'Initial':
			self.sequencepair_object.set_initial_readcount(read_count)
			self.sequencepair_object.set_initial_fastqc(target_file)
			self.sequencepair_object.set_initial_pbsq(pbsq)
			self.sequencepair_object.set_initial_gcpcnt(gc_pcnt)
		elif self.stage == 'PostDMPX':
			self.sequencepair_object.set_postdmpx_readcount(read_count)
			self.sequencepair_object.set_postdmpx_fastqc(target_file)
			self.sequencepair_object.set_postdmpx_pbsq(pbsq)
			self.sequencepair_object.set_postdmpx_gcpcnt(gc_pcnt)
		elif self.stage == 'PostTrim':
			self.sequencepair_object.set_posttrim_readcount(read_count)
			self.sequencepair_object.set_posttrim_fastqc(target_file)
			self.sequencepair_object.set_posttrim_pbsq(pbsq)
			self.sequencepair_object.set_posttrim_gcpcnt(gc_pcnt)
def fastqc_parser(fastqcDir, filename):
    '''FASTQC PARSER
    Extracts info from fastqc output files --> currently uses FADAPA parser

    Looks for ``<fastqcDir>/*_fastqc/<filename>`` and, for every match, reads
    rows 1, 4 and 6 of the 'Basic Statistics' table by position.

    Returns a dict keyed by the row-1 value truncated at its first '.f',
    with values ``[row-4 value, row-6 value]``; empty when nothing matches.
    '''
    fastqcD = {}
    pattern = os.path.join(fastqcDir, "*_fastqc", filename)
    for report in sorted(glob.glob(pattern)):
        # Parse the module once and reuse it (it used to be parsed twice).
        basicStats = Fadapa(report).clean_data('Basic Statistics')
        fName = basicStats[1][1].rsplit('.f')[0]
        numSeqs = basicStats[4][1]
        longestRead = basicStats[6][1]
        fastqcD[fName] = [numSeqs, longestRead]
    return fastqcD
예제 #8
0
 def test_multi_data(self):
     """Loading 'tests/fastqc_multiple.zip' must emit exactly one UserWarning
     that reports multiple files and names one_fastqc_data.txt as the choice.
     """
     with warnings.catch_warnings(record=True) as caught:
         warnings.simplefilter('always')
         Fadapa('tests/fastqc_multiple.zip')
         self.assertEqual(len(caught), 1)
         latest = caught[-1]
         self.assertTrue(issubclass(latest.category, UserWarning))
         text = str(latest.message)
         self.assertTrue('Multiple files' in text)
         self.assertTrue('Choosing one_fastqc_data.txt' in text)
         self.assertTrue('Choosing two_fastqc_data.txt' not in text)
예제 #9
0
class TestFadapa(unittest.TestCase):
    """Checks of the Fadapa parser against the 'tests/fastqc_data.txt' fixture."""

    def setUp(self):
        self.p_data = Fadapa('tests/fastqc_data.txt')

    def test_summary(self):
        """The first summary row is the column header."""
        summary = self.p_data.summary()
        self.assertEqual(summary[0], ['Module Name', 'Status'])

    def test_content(self):
        """content() prints the report, which starts with '##FastQC'."""
        captured = StringIO()
        original_stdout = sys.stdout
        sys.stdout = captured
        try:
            self.p_data.content()
        finally:
            # Put the real stream back so later tests and the runner's own
            # output are not swallowed by the capture buffer.
            sys.stdout = original_stdout
        self.assertEqual(captured.getvalue()[:8], '##FastQC')

    def test_raw_data(self):
        """The raw module data ends with the end-of-module marker."""
        data = self.p_data.raw_data('Basic Statistics')
        self.assertEqual(data[-1], '>>END_MODULE')

    def test_cleaned_data(self):
        """The first cleaned cell is the 'Measure' column title."""
        data = self.p_data.clean_data('Basic Statistics')
        self.assertEqual(data[0][0], 'Measure')
예제 #10
0
class TestFadapa(unittest.TestCase):
    """Exercises summary/content/raw_data/clean_data on a plain-text fixture."""

    def setUp(self):
        self.p_data = Fadapa('tests/fastqc_data.txt')

    def test_summary(self):
        """summary() starts with its header row."""
        summary = self.p_data.summary()
        self.assertEqual(summary[0], ['Module Name', 'Status'])

    def test_content(self):
        """content() writes the report to stdout, beginning with '##FastQC'."""
        buffer = StringIO()
        saved_stdout, sys.stdout = sys.stdout, buffer
        try:
            self.p_data.content()
        finally:
            # Without this, stdout stays redirected for the rest of the run.
            sys.stdout = saved_stdout
        self.assertEqual(buffer.getvalue()[:8], '##FastQC')

    def test_raw_data(self):
        """raw_data() keeps the closing '>>END_MODULE' line."""
        data = self.p_data.raw_data('Basic Statistics')
        self.assertEqual(data[-1], '>>END_MODULE')

    def test_cleaned_data(self):
        """clean_data() exposes 'Measure' as the first header cell."""
        data = self.p_data.clean_data('Basic Statistics')
        self.assertEqual(data[0][0], 'Measure')
예제 #11
0
    def format_fastqc(self, rawDataPath, currSample):
        """
        Insert values from one FastQC report into the 'fastqc.html' template.

        :param rawDataPath: path given to Fadapa to load the FastQC data
        :param currSample: unused in this method; kept so callers need no change
        :return: the rendered template text
        """

        ## FastQC templates
        fastqc_template = os.path.join(self.TEMPLATES_BASE, 'fastqc.html')

        ## Load the report once; every value below comes from this object
        fqc_object = Fadapa(rawDataPath)

        ## Module status data
        ## First column of summary rows 1-11, addressed purely by position
        module_summary = fqc_object.summary()
        (module_stats, module_pbsq, module_ptsq, module_psqs, module_pbsc,
         module_psgcc, module_pbnc, module_seqlendist, module_seqdup,
         module_overrep, module_adapter) = [
            module_summary[row][0] for row in range(1, 12)]

        ## Basic statistics data
        ## Second column of rows 1-7, addressed purely by position
        basic_stats = fqc_object.clean_data('Basic Statistics')
        (file_name, file_type, encoding, total_sequences, poor_quality,
         seq_len, gc_pcnt) = [basic_stats[row][1] for row in range(1, 8)]

        template_values = dict(
            MODULE_STATS=module_stats, MODULE_PBSQ=module_pbsq,
            MODULE_PTSQ=module_ptsq, MODULE_PSQS=module_psqs,
            MODULE_PBSC=module_pbsc, MODULE_PSGCC=module_psgcc,
            MODULE_PBNC=module_pbnc, MODULE_SEQLENDIST=module_seqlendist,
            MODULE_SEQDUP=module_seqdup, MODULE_OVERREP=module_overrep,
            MODULE_ADAPTER=module_adapter,
            FQC_FILENAME=file_name, FQC_FILETYPE=file_type,
            FQC_ENCODING=encoding, FQC_TOTALSEQ=total_sequences,
            FQC_POORQUAL=poor_quality, FQC_SEQLEN=seq_len,
            FQC_GCPCNT=gc_pcnt)

        ## FastQC html template file with data inserted
        ## The context manager closes the file even when formatting fails;
        ## lines are collected and joined once instead of re-concatenated
        rendered_lines = []
        with open(fastqc_template, 'r') as f:
            for line in f:
                rendered_lines.append(line.format(**template_values))

        ## return formatted FastQC report
        return ''.join(rendered_lines)
예제 #12
0
def _merge_fastq(data):
    """
    merge all fastqc samples into one by module
    """
    fastqc_list = {}
    for sample in data:
        name = dd.get_sample_name(sample[0])
        fn = os.path.join(dd.get_work_dir(sample[0]), "qc", dd.get_sample_name(sample[0]), "fastqc", "fastqc_data.txt")
        fastqc_list[name] = Fadapa(fn)

    module = [m[1] for m in fastqc_list[name].summary()][2:9]
    for m in module:
        out_file = os.path.join(m.replace(" ", "_") + ".tsv")
        dt = _get_module(fastqc_list, m)
        dt.to_csv(out_file, sep="\t", index=False)
    return [data]
예제 #13
0
###Parses fastqc output to find overrepresented sequences to feed into cutadapt with a capital -A command

# imports
from fadapa import Fadapa
import sys

#the calling bash script passes the cell UID ($j) as the first argument
name = sys.argv[1]

#parse this cell's FastQC report
report_path = '/home/graham/Downloads/trial/' + name + '_fastqc/fastqc_data.txt'
f = Fadapa(report_path)

#first raw line of the module, i.e. its header line with the status
pass_seq = f.raw_data('Overrepresented sequences')[0]

#sequences found so far (stays empty when the module passed)
list_of_seqs = []

#the clean parser breaks when the module holds no sequences,
#so .clean is only consulted when the header is not a plain pass
module_passed = pass_seq == ">>Overrepresented sequences	pass"
if not module_passed:
    #first column of every cleaned row: the #Sequence title first, then the actual seqs
    list_of_seqs.extend(row[0] for row in f.clean_data('Overrepresented sequences'))

#output string, empty to begin with
output = ""

#Loop through the list of sequences from index 1 onwards (as the index 0 will be #Sequence)
예제 #14
0
def reorganize(sample_dir):
    """Collect one sample's pipeline outputs under ``<sample_dir>/LSARP_Results/``.

    The stages run in order: FastQC tables, Centrifuge table, AMRP reports,
    MLST report, de novo assembly, GAEMR assembly QC, Pilon output, StrainGST
    result. Every stage is best-effort: when its inputs are missing or it
    fails, an error is logged and the following stages still run. On normal
    return ``LSARP.txt`` is written into the sample directory.

    Args:
        sample_dir: Path of the sample directory. Its last path component is
            used as the sample name.

    Raises:
        RuntimeError: if ``sample_dir`` is not an existing directory.
    """
    # Checked explicitly (not with ``assert``) so it still runs under
    # ``python -O``; an unusable argument type is treated as "does not exist".
    try:
        sample_dir_found = os.path.isdir(sample_dir)
    except Exception:
        sample_dir_found = False
    if not sample_dir_found:
        sys.stderr.write(
            "ERROR: Sample directory doesn't seem to exist! Exiting now ...\n")
        raise RuntimeError

    sample_dir = os.path.abspath(sample_dir) + '/'

    # set up directory structure
    workspace_name = "LSARP_Results/"
    workspace = sample_dir + workspace_name
    if not os.path.isdir(workspace):
        workspace = uF.setupDirectory(sample_dir, workspace_name)

    # create logging object
    log_file = workspace + 'LSARP_Table_Creation.log'
    logObject = uF.createLoggerObject(log_file)

    sample = sample_dir.split('/')[-2]
    logObject.info("Creating easy upload formats for sample %s", sample)
    logObject.info("-" * 80)

    # FASTQC Tables

    logObject.info('Creating FastQC Data Tables.')
    logObject.info('-' * 80)

    FastQC_results = 'FastQC/'
    FastQC_results_workspace = workspace + FastQC_results
    fastqc_modules = [
        'Per base sequence quality', 'Per tile sequence quality',
        'Per sequence quality scores', 'Per base sequence content',
        'Per sequence GC content', 'Per base N content',
        'Sequence Length Distribution', 'Sequence Duplication Levels',
        'Overrepresented sequences', 'Adapter Content'
    ]

    try:
        fastqc_zipped_data_dirs = [
            sample_dir + 'FastQC/' + zd
            for zd in os.listdir(sample_dir + 'FastQC/') if zd.endswith('.zip')
        ]
        if not fastqc_zipped_data_dirs:
            raise IOError('no FastQC zip archives found')
        for zd in fastqc_zipped_data_dirs:
            if not os.path.isfile(zd):
                raise IOError('%s is not a file' % zd)
        if not os.path.isdir(FastQC_results_workspace):
            FastQC_results_workspace = uF.setupDirectory(
                workspace, FastQC_results)
    except Exception:
        logObject.error(
            'No FastQC results available or path is unable to be determined!')
    else:
        tmp_report = FastQC_results_workspace + 'tmp.txt'
        for zd in fastqc_zipped_data_dirs:
            with zipfile.ZipFile(zd) as z:
                for filename in z.namelist():
                    if filename.split('/')[-1] != 'fastqc_data.txt':
                        continue
                    # Fadapa is given a path, so the archived report is first
                    # copied out to a scratch file.
                    with z.open(filename) as fh, \
                            open(tmp_report, 'wb') as FastQC_tmp_out:
                        for line in fh:
                            FastQC_tmp_out.write(line)
                    fadapa = Fadapa(tmp_report)
                    # NOTE(review): every archive writes the same
                    # <module>.table.txt in 'w' mode, so a later archive
                    # replaces the table of an earlier one -- confirm intent.
                    for module in fastqc_modules:
                        table_file = '_'.join(module.split())
                        try:
                            cleaned_module_data = fadapa.clean_data(module)
                            if not cleaned_module_data:
                                continue
                            table_path = (FastQC_results_workspace +
                                          table_file + '.table.txt')
                            with open(table_path, 'w') as table_handle:
                                for i, split_line in enumerate(
                                        cleaned_module_data):
                                    if i == 0:
                                        prefix = ['sample', 'read']
                                    else:
                                        # read label: archive name with the
                                        # '<sample>_' prefix, '_fastqc.zip'
                                        # suffix and any extension removed
                                        read = zd.split('/')[-1].split(
                                            sample + '_')[1].split(
                                                '_fastqc.zip')[0].split('.')[0]
                                        prefix = [sample, read]
                                    table_handle.write(
                                        '\t'.join(prefix + split_line) + '\n')
                        except Exception as err:
                            # Still best-effort per module, but no longer
                            # silent about what was skipped.
                            logObject.info(
                                'Skipping FastQC module "%s" for %s: %s',
                                module, zd, err)
                    os.remove(tmp_report)
    logObject.info('*' * 80)

    # Centrifuge Tables

    logObject.info('Creating Centrifuge Data Tables.')
    logObject.info('-' * 80)

    Centrifuge_results = 'Centrifuge/'
    Centrifuge_results_workspace = workspace + Centrifuge_results

    centrifuge_report_file = (sample_dir + 'Centrifuge/' + sample +
                              '_centrifuge_report.tsv')
    kraken_report_file = (sample_dir + 'Centrifuge/' + sample +
                          '_centrifuge_kraken_report.txt')

    try:
        if not (os.path.isfile(centrifuge_report_file)
                and os.path.isfile(kraken_report_file)):
            raise IOError('Centrifuge report files are missing')
        if not os.path.isdir(Centrifuge_results_workspace):
            Centrifuge_results_workspace = uF.setupDirectory(
                workspace, Centrifuge_results)

        centrifuge_report_table_file = (Centrifuge_results_workspace +
                                        'centrifuge_report.table.txt')

        # Taxa absent from the Centrifuge report get 'NA' in all six fields.
        centrifuge_report_data = defaultdict(lambda: ['NA'] * 6)
        with open(centrifuge_report_file) as report_handle:
            for i, line in enumerate(report_handle):
                if i > 0:  # line 0 is skipped
                    line = line.rstrip('\n')
                    (name, taxID, taxRank, genomeSize, numReads,
                     numUniqueReads, abundance) = line.split('\t')
                    centrifuge_report_data[name] = [
                        taxID, taxRank, genomeSize, numReads, numUniqueReads,
                        abundance
                    ]

        header = [
            'sample', 'taxonomy_name', 'taxonomy_level', 'taxonomy_rank',
            'taxonomy_id', 'genome_size', 'centrifuge_abundance',
            'percentage_of_fragments_recursively_covered',
            'number_of_fragments_recursively_included',
            'number_of_fragments_direct'
        ]
        with open(centrifuge_report_table_file, 'w') as table_handle, \
                open(kraken_report_file) as kraken_handle:
            table_handle.write('\t'.join(header) + '\n')
            for line in kraken_handle:
                line = line.rstrip('\n')
                fields = line.split()
                # First five whitespace-separated fields are fixed columns;
                # the rest of the line is the taxon name.
                prop, frag_recurse, frag_direct, tax_level, tax_id = fields[:5]
                tax = ' '.join(fields[5:]).strip()
                (taxID, taxRank, genomeSize, numReads, numUniqueReads,
                 abundance) = centrifuge_report_data[tax]
                table_handle.write('\t'.join([
                    sample, tax, tax_level, taxRank, taxID, genomeSize,
                    abundance, prop, frag_recurse, frag_direct
                ]) + '\n')
    except Exception:
        logObject.error('No Centrifuge results available!')

    logObject.info('*' * 80)

    # AMRP Tables

    logObject.info('Moving Results from ARIBA and ShortBRED AMR Searches.')
    logObject.info('-' * 80)

    AMRP_results = 'AMRP_Searches/'
    AMRP_results_workspace = workspace + AMRP_results

    try:
        AMRP_dir = sample_dir + 'AMRP/'
        if not os.path.isdir(AMRP_dir):
            raise IOError('%s is not a directory' % AMRP_dir)
        # Test the results directory itself. The old check joined the
        # workspace with the absolute AMRP input path, which never exists, so
        # the directory was set up again on every run.
        if not os.path.isdir(AMRP_results_workspace):
            AMRP_results_workspace = uF.setupDirectory(workspace, AMRP_results)
        for sd in os.listdir(AMRP_dir):
            ariba_report = AMRP_dir + sd + '/' + 'report.tsv'
            if os.path.isfile(ariba_report):
                ariba_result = (AMRP_results_workspace + sample + '_' + sd +
                                '_ariba_results.txt')
                os.system('cp %s %s' % (ariba_report, ariba_result))
    except Exception:
        logObject.error('Unable to create AMR prediction data tables.')

    logObject.info('*' * 80)

    # MLST Tables

    logObject.info('Creating MLST Data Tables.')
    logObject.info('-' * 80)

    MLST_results = 'MLST/'
    MLST_results_workspace = workspace + MLST_results

    try:
        MLST_dir = sample_dir + 'MLST/'
        MLST_result_file = MLST_dir + 'ariba_mlst/mlst_report.tsv'

        if not os.path.isdir(MLST_results_workspace):
            MLST_results_workspace = uF.setupDirectory(workspace, MLST_results)
        # The exit status of cp is not inspected, so a missing report is not
        # reported here.
        os.system('cp %s %s' % (MLST_result_file, MLST_results_workspace))
    except Exception:
        logObject.error('Unable to create MLST call data tables.')

    logObject.info('*' * 80)

    # De Novo Assembly Storage

    logObject.info('Moving de novo assembly to results directory.')
    logObject.info('-' * 80)

    Assembly_results = 'Assembly/'
    Assembly_results_workspace = workspace + Assembly_results

    try:
        Assembly_dir = sample_dir + 'Assembly/'
        # assembly.fasta is preferred; scaffolds.fasta is the fallback
        Assembly_original_location = Assembly_dir + 'assembly.fasta'
        if not os.path.isfile(Assembly_original_location):
            Assembly_original_location = Assembly_dir + 'scaffolds.fasta'
        if not os.path.isfile(Assembly_original_location):
            raise IOError('no assembly FASTA found in %s' % Assembly_dir)
        if not os.path.isdir(Assembly_results_workspace):
            Assembly_results_workspace = uF.setupDirectory(
                workspace, Assembly_results)
        Assembly_new_location = (Assembly_results_workspace + sample +
                                 '.genome.fa')
        os.system('cp %s %s' %
                  (Assembly_original_location, Assembly_new_location))
    except Exception:
        logObject.error('Unable to move assembly to results directory.')

    logObject.info('*' * 80)

    # Assembly QC Storage

    logObject.info('Moving GAEMR assembly QC to results directory.')
    logObject.info('-' * 80)

    try:
        Assembly_QC_new_location = workspace + 'Assembly_QC/'
        Assembly_QC_original_dir = sample_dir + 'GAEMR/QC/'
        if not os.path.isdir(Assembly_QC_original_dir):
            raise IOError('%s is not a directory' % Assembly_QC_original_dir)
        os.system('cp -r %s %s' %
                  (Assembly_QC_original_dir, Assembly_QC_new_location))
    except Exception:
        logObject.error(
            'Unable to move GAEMR assembly QC to results directory.')

    logObject.info('*' * 80)

    # Pilon Results Storage

    logObject.info('Moving Pilon output to results directory.')
    logObject.info('-' * 80)

    try:
        Pilon_new_dir = workspace + 'Reference_Assembly_and_Variant_Calling/'
        Pilon_original_dir = sample_dir + 'Pilon/results/'
        if not os.path.isdir(Pilon_original_dir):
            raise IOError('%s is not a directory' % Pilon_original_dir)
        os.system('cp -r %s %s' % (Pilon_original_dir, Pilon_new_dir))
        os.system('gzip %s*' % Pilon_new_dir)
    except Exception:
        logObject.error('Unable to move Pilon output to results directory.')

    logObject.info('*' * 80)

    # StrainGST Results Storage

    logObject.info('Moving StrainGST output to results directory.')
    logObject.info('-' * 80)

    try:
        Straingst_result_file = (sample_dir + 'StrainGST/' + sample +
                                 '.straingst_result.tsv')
        if not os.path.isfile(Straingst_result_file):
            raise IOError('%s is not a file' % Straingst_result_file)
        Straingst_new_dir = 'StrainGST/'
        Straingst_results_workspace = workspace + Straingst_new_dir
        if not os.path.isdir(Straingst_results_workspace):
            Straingst_results_workspace = uF.setupDirectory(
                workspace, Straingst_new_dir)
        os.system('cp %s %s' %
                  (Straingst_result_file, Straingst_results_workspace))
    except Exception:
        logObject.error(
            'Unable to move StrainGST output to results directory.')

    logObject.info('*' * 80)

    uF.closeLoggerObject(logObject)

    # create successful completion file if steps completed!
    with open(sample_dir + "LSARP.txt", 'w') as conf_file:
        conf_file.write("LSARP Table Creation: Module Completed Succesfully!")
예제 #15
0
def fadapa_from_filepath(filepath):
    """Build and return a ``Fadapa`` parser for the report at ``filepath``."""
    parser = Fadapa(filepath)
    return parser
예제 #16
0
 def setUp(self):
     """Parse the zipped fixture and keep the parser on ``self.p_data``."""
     fixture_path = 'tests/fastqc.zip'
     self.p_data = Fadapa(fixture_path)
예제 #17
0
    def format_fastqc_graphs(self, rawDataPath, currSample):
        """
        Extract plot data for three FastQC modules from one report.

        :param rawDataPath: path given to Fadapa to load the FastQC data
        :param currSample: sample label used in the N-content and
            length-distribution titles
        :return: dict of str -> str; lists are stored in their str() form
        """

        def nan_safe_int(raw_value):
            ## replace NaN with 0, otherwise convert str of float->float->int
            if raw_value == 'NaN':
                return 0
            return int(float(raw_value))

        ## Object to get data from FastQC output
        fqc_object = Fadapa(rawDataPath)

        ## Unextracted data; the first row of each table is dropped as a header
        pbsq_rows = fqc_object.clean_data('Per base sequence quality')[1:]
        pbnc_rows = fqc_object.clean_data('Per base N content')[1:]
        seqlen_rows = fqc_object.clean_data('Sequence Length Distribution')[1:]

        ## Target output dictionary
        fastqc_graphdata = {}

        ##
        ## Per Base Pair Sequence Quality
        ## min = item[5], q1 = item[3], median = item[2], q3 = item[4], max = item[6]
        box_columns = (5, 3, 2, 4, 6)
        pbsq_labels = []
        pbsq_values = []
        pbsq_means = []
        for item in pbsq_rows:
            pbsq_labels.append(item[0])  ## label for bin
            ## sample running mean; a 'NaN' mean used to raise ValueError even
            ## though the box values on the same row were already NaN-safe
            pbsq_means.append(nan_safe_int(item[1]))
            pbsq_values.append([nan_safe_int(item[col]) for col in box_columns])
        fastqc_graphdata['PBSQ_TITLE'] = 'FastQC Per base sequence quality'
        fastqc_graphdata['PBSQ_LABELS'] = str(pbsq_labels)
        fastqc_graphdata['PBSQ_VALUES'] = str(pbsq_values)
        fastqc_graphdata['PBSQ_MEANVAL'] = str(pbsq_means)
        fastqc_graphdata['PBSQ_DESCR'] = 'Per base sequence quality'
        fastqc_graphdata['PBSQ_X'] = 'Position in read (BP)'
        fastqc_graphdata['PBSQ_Y'] = 'PHRED quality score'

        ##
        ## Per Base Pair N Content (values stay as strings)
        fastqc_graphdata['PBNC_TITLE'] = (
            'FastQC Per base N content for {}'.format(currSample))
        fastqc_graphdata['PBNC_LABELS'] = str([item[0] for item in pbnc_rows])
        fastqc_graphdata['PBNC_VALUES'] = str([item[1] for item in pbnc_rows])
        fastqc_graphdata['PBNC_DESCR'] = 'N content per base'
        fastqc_graphdata['PBNC_X'] = 'Position in read (BP)'
        fastqc_graphdata['PBNC_Y'] = 'Percentage content (%)'

        ##
        ## Sequence Length Distribution (values stay as strings)
        fastqc_graphdata['SQLD_TITLE'] = (
            'FastQC Sequence length distribution for {}'.format(currSample))
        fastqc_graphdata['SQLD_LABELS'] = str([item[0] for item in seqlen_rows])
        fastqc_graphdata['SQLD_VALUES'] = str([item[1] for item in seqlen_rows])
        fastqc_graphdata['SQLD_DESCR'] = 'Sequence length population'
        fastqc_graphdata['SQLD_X'] = 'Sequence length (BP)'
        fastqc_graphdata['SQLD_Y'] = 'Population (#)'

        return fastqc_graphdata
예제 #18
0
#Searches for overrepresented sequences within a genome sequence and produces output for extracting these sequences in an upper-case format, as the sequences can be in lower or upper case. It is used instead of FASTQC when FASTQC cannot find any sequences. The output can then be used by cutadapt.

from fadapa import Fadapa
import sys
#input: name of the directory holding fastqc_data.txt
file_one = sys.argv[1]

f = Fadapa('/home/rsk17/Group_project/1_Original_pipeline/' + file_one +
           '/fastqc_data.txt')
#first raw line of the module, i.e. its ">>Overrepresented sequences <status>" header
good_seq = f.raw_data("Overrepresented sequences")[0]
#create an empty list for the overrepresented sequences to be appended to
seq_list = []

#FastQC separates the module name and its status with a tab, so comparing the
#whole line against a literal containing spaces never matched and the guard
#below was always taken. Read the status as the last whitespace-separated
#field instead, which works for a tab or for spaces.
header_fields = good_seq.split()
module_status = header_fields[-1] if header_fields else ""

#only read the parsed table when the module did not simply pass
if module_status != "pass":
    for data in f.clean_data('Overrepresented sequences'):
        seq_list.append(data[0])

#this gets the output into the correct format for cutadapt (-A XXXX etc)
#seq_list[1:] skips seq_list[0], which is the #Sequence column title.
#Entries are separated by spaces so the shell passes each "-A <sequence>"
#pair to cutadapt separately; they used to be glued into one token.
output = " ".join("-A " + items for items in seq_list[1:])

print(output)
예제 #19
0
    def fastqc_result(self, r1, r2, sample, output, type):
        """Summarise the FastQC results of a read pair and write report tables.

        Both inputs are parsed with ``Fadapa``. The raw data rows of selected
        modules are written to text files under ``<output>/data/stat/``:

        * ``<sample>.fastq_quality_score_r1.txt`` / ``..._r2.txt`` -- rows of
          'Per sequence quality scores' for R1 / R2
        * ``<sample>.base_sequence_quality.txt`` -- rows of
          'Per base sequence quality' (R1 only)
        * ``<sample>.sequence_length_distribution.txt`` -- rows of
          'Sequence Length Distribution' (R1 only)

        Args:
            r1: FastQC data source for read 1, handed to ``Fadapa``.
            r2: FastQC data source for read 2, handed to ``Fadapa``.
            sample: Sample name used as the prefix of the written files.
            output: Base output directory; ``data/stat`` below it must exist.
            type: Run label. Only the value "raw" triggers the pass/warn
                evaluation. The name shadows the builtin but is kept so that
                keyword callers keep working.

        Returns:
            dict with the keys 'mean_read_quality',
            'fastqc_sequence_length_distribution',
            'per_sequence_quality_score', 'per_base_quality', 'summary',
            'pass' and 'fastq_file_name'.

        Side effects:
            Increments ``self._qc_pass_count`` when ``type == "raw"`` and more
            than 90 percent of the reads have a quality score of at least 30.
        """

        def dump_rows(parsed, module, path_template, marker_prefixes):
            # Write every line of the module that is not a marker/header line.
            # Iteration stops once more than three marker lines were skipped.
            markers_seen = 0
            # The context manager closes the file even if a write fails.
            with open(path_template % (output, sample), "w") as handle:
                for line in parsed.raw_data(module):
                    if markers_seen > 3:
                        break
                    if line.startswith(marker_prefixes):
                        markers_seen += 1
                    else:
                        handle.write(line)
                        handle.write("\n")

        def tally_quality(parsed, scores):
            # Fill `scores` (quality -> count) and return the pair
            # (total read count, count of reads with quality >= 30).
            total = 0
            above_30 = 0
            for row in parsed.clean_data('Per sequence quality scores'):
                if row[0] != "Quality":
                    scores[row[0]] = row[1]
                    reads = float(row[1])
                    total += reads
                    if int(row[0]) >= 30:
                        above_30 += reads
            return total, above_30

        fq1 = Fadapa(r1)
        fq2 = Fadapa(r2)

        # NOTE(review): these two dicts are returned but never filled; the
        # per-read dicts below are filled but never returned. Confirm the
        # intended shape before wiring them together.
        fastqc_per_sequence_quality = {}
        fastqc_sequence_length_distribution = {}
        fastqc_per_sequence_quality_r1 = {}
        fastqc_per_sequence_quality_r2 = {}
        fastqc_sequence_length_distribution_r1 = {}
        fastqc_sequence_length_distribution_r2 = {}

        # fastqc per sequence quality scores for report (R1, then R2)
        quality_markers = ('>>Per', '#Qual', ">>END")
        dump_rows(fq1, 'Per sequence quality scores',
                  "%s/data/stat/%s.fastq_quality_score_r1.txt",
                  quality_markers)
        dump_rows(fq2, 'Per sequence quality scores',
                  "%s/data/stat/%s.fastq_quality_score_r2.txt",
                  quality_markers)

        # fastqc per base sequence quality for report (R1 only)
        dump_rows(fq1, 'Per base sequence quality',
                  "%s/data/stat/%s.base_sequence_quality.txt",
                  (">>Per", "#Base", ">>END"))

        # fastqc sequence length distribution for report (R1 only)
        dump_rows(fq1, 'Sequence Length Distribution',
                  "%s/data/stat/%s.sequence_length_distribution.txt",
                  (">>Seq", "#", ">>END"))

        # fastqc parse for json
        for row in fq1.clean_data('Sequence Length Distribution'):
            if row[0] != "Length":
                fastqc_sequence_length_distribution_r1[row[0]] = row[1]
        for row in fq2.clean_data('Sequence Length Distribution'):
            if row[0] != "Length":
                fastqc_sequence_length_distribution_r2[row[0]] = row[1]

        total_reads_r1, above_30_r1 = tally_quality(
            fq1, fastqc_per_sequence_quality_r1)
        total_reads_r2, above_30_r2 = tally_quality(
            fq2, fastqc_per_sequence_quality_r2)

        # mean read quality (percentage of reads with mean Phred base quality
        # above 30); an input without any quality rows yields 0.0 instead of
        # raising ZeroDivisionError.
        total_reads = total_reads_r1 + total_reads_r2
        above_30 = above_30_r1 + above_30_r2
        if total_reads:
            mean_read_quality_percentage = above_30 / total_reads * 100
        else:
            mean_read_quality_percentage = 0.0

        mean_read_quality = {
            'total_read': total_reads,
            'above_30': above_30,
            'percentage': mean_read_quality_percentage,
        }
        if type == "raw":
            if 90 < mean_read_quality_percentage:
                mean_read_quality['message'] = "pass"
                self._qc_pass_count = self._qc_pass_count + 1
            else:
                mean_read_quality['message'] = "warn"

        # base -> "base:mean:median"
        # NOTE(review): the sibling loops skip their header row by its first
        # cell ("Measure", "Length", "Quality"); "base_Base" presumably never
        # matches, so the header row likely ends up in this dict -- confirm
        # against the fadapa output before changing it.
        fastqc_per_base_quality = {}
        for row in fq1.clean_data('Per base sequence quality'):
            if row[0] != "base_Base":
                fastqc_per_base_quality[row[0]] = "{0}:{1}:{2}".format(
                    row[0], row[1], row[2])

        # measure -> value (R1 only)
        fastqc_summary = {}
        for row in fq1.clean_data('Basic Statistics'):
            if row[0] != "Measure":
                fastqc_summary[row[0]] = row[1]

        # module name -> status (R1 only)
        fastqc_pass = {}
        for row in fq1.summary():
            if row[1] != "Module Name":
                fastqc_pass[row[1]] = row[0]

        fastqc = {
            'mean_read_quality': mean_read_quality,
            'fastqc_sequence_length_distribution':
                fastqc_sequence_length_distribution,
            'per_sequence_quality_score': fastqc_per_sequence_quality,
            'per_base_quality': fastqc_per_base_quality,
            'summary': fastqc_summary,
            'pass': fastqc_pass,
            'fastq_file_name': "%s-%s" % (r1, r2),
        }
        return fastqc
예제 #20
0
 def setUp(self):
     """Parse the FastQC fixture from an open file object and store the result."""
     with open('tests/fastqc_data.txt') as handle:
         parsed = Fadapa(handle)
     self.p_data = parsed
예제 #21
0
 def setUp(self):
     """Parse the FastQC fixture given by path and keep it on the test case."""
     fixture_path = 'tests/fastqc_data.txt'
     self.p_data = Fadapa(fixture_path)
예제 #22
0
 def setUp(self):
     """Build the parsed FastQC object shared by the tests from its file path."""
     parsed = Fadapa('tests/fastqc_data.txt')
     self.p_data = parsed
예제 #23
0
 def test_fastqc_data_not_found(self):
     # Constructing Fadapa from 'tests/empty.zip' is expected to raise
     # FastqcDataError; the callable form of assertRaises performs the call.
     self.assertRaises(FastqcDataError, Fadapa, 'tests/empty.zip')