예제 #1
0
 def test_intersect(self):
     """intersect() returns both passed samples of GSE0 for the fake isamp."""
     fake_isamp = self.gen_fake_isamp()
     gse = Series('GSE0')
     expected = [Sample(gsm_name, gse) for gsm_name in ('GSM10', 'GSM20')]
     # register every sample as "passed" on the series under test
     for sample in expected:
         gse.add_passed_sample(sample)
     result = ppr.intersect(gse, fake_isamp)
     self.assertEqual(result, expected)
예제 #2
0
    def test_gen_all_samples_from_soft_and_isamp(
            self, mock_get_isamp, mock_analyze_one, mock_sanity_check):
        """The samples produced by analyze_one for the one soft file are returned.

        The three mocks are injected from outside this method (their patch
        targets are not visible here); the sanity check is forced to pass
        and the isamp loader yields the fake isamp.
        """
        mock_sanity_check.return_value = True
        mock_get_isamp.return_value = self.gen_fake_isamp()

        gse = Series('GSE0')
        expected = [Sample(gsm_name, gse) for gsm_name in ('GSM10', 'GSM20')]
        for sample in expected:
            sample.organism = 'H**o Sapiens'
            gse.add_passed_sample(sample)
        mock_analyze_one.return_value = expected

        config = {'INTERESTED_ORGANISMS': ['H**o Sapiens']}
        result = ppr.gen_all_samples_from_soft_and_isamp(
            ['soft1'], 'isamp_file_or_str', config)
        self.assertEqual(result, expected)
예제 #3
0
 def test_add(self, mock_is_info_complete):
     """add() returns index + 1 for a complete sample, else the same index."""
     gse = Series('GSE123456', 'GSE123456_family.soft.subset')
     sample = Sample('GSM1', gse)
     # (mocked completeness of the sample info, index expected back)
     cases = ((True, 2), (False, 1))
     for info_complete, expected_index in cases:
         mock_is_info_complete.return_value = info_complete
         self.assertEqual(soft_parser.add(sample, gse, 1), expected_index)
예제 #4
0
 def test_intersect_with_discrenpacy(self, L):
     """intersect() still returns the one soft sample and logs a discrepancy.

     L is the injected log-capture object (presumably testfixtures'
     LogCapture -- confirm against the decorator on this method).
     """
     fake_isamp = self.gen_fake_isamp()
     gse = Series('GSE0')
     expected = [Sample('GSM10', gse)]
     for sample in expected:
         gse.add_passed_sample(sample)
     self.assertEqual(ppr.intersect(gse, fake_isamp), expected)

     # exactly one ERROR record is expected from the module under test
     expected_record = (
         'rsempipeline.utils.pre_pipeline_run',
         'ERROR',
         'Discrepancy for GSE0: 1 GSMs in soft, 2 GSMs in isamp, '
         'and only 1 left after intersection.',
     )
     L.check(expected_record)
예제 #5
0
 def test_analyze_one(self, mock_parse, L):
     """analyze_one() returns the parsed series' sample and logs a discrepancy.

     mock_parse stands in for the soft-file parser and hands back a
     prepared series; L is the injected log-capture object (presumably
     testfixtures' LogCapture -- confirm against the decorators).
     """
     isamp = self.gen_fake_isamp()
     gse = Series('GSE0')
     expected = [Sample('GSM10', gse)]
     for sample in expected:
         sample.organism = 'H**o Sapiens'
         gse.add_passed_sample(sample)
     mock_parse.return_value = gse

     result = ppr.analyze_one(
         'GSE0_family.soft.subset', isamp, ['H**o sapiens'])
     self.assertEqual(result, expected)

     expected_record = (
         'rsempipeline.utils.pre_pipeline_run',
         'ERROR',
         'Discrepancy for GSE0: 1 GSMs in soft, 2 GSMs in isamp, '
         'and only 1 left after intersection.',
     )
     L.check(expected_record)
예제 #6
0
    def test_gen_orig_params_per_with_a_single_sra(self):
        """gen_orig_params_per() yields one parameter set for a single SRA."""
        # build a series holding one passed sample with a known outdir
        gse = Series('GSE123456', 'GSE123456_family.soft.subset')
        sample = Sample('GSM1', gse)
        sample.outdir = 'some_outdir/GSE123456/some_species/GSM1'
        gse.add_passed_sample(sample)

        # serve the single-SRA YAML fixture whenever the module calls open()
        fake_open = mock.mock_open(read_data=SRA_INFO_YAML_SINGLE_SRA)
        with mock.patch('rsempipeline.utils.download.open', fake_open):
            vals = download.gen_orig_params_per(sample)

        expected_outputs = [
            'some_outdir/GSE123456/some_species/GSM1/SRX685892/SRR1557065/SRR1557065.sra',
            'some_outdir/GSE123456/some_species/GSM1/SRR1557065.sra.download.COMPLETE',
        ]
        # in the format of input, outputs, other params
        expected = [[None, expected_outputs, sample]]
        self.assertEqual(vals, expected)
예제 #7
0
    def test_gen_orig_params_per_with_multiple_sras(self):
        """gen_orig_params_per() yields one parameter set per SRA, in order."""
        # build a series holding one passed sample with a known outdir
        gse = Series('GSE123456', 'GSE123456_family.soft.subset')
        sample = Sample('GSM1', gse)
        sample.outdir = 'some_outdir/GSE123456/some_species/GSM1'
        gse.add_passed_sample(sample)

        # serve the multi-SRA YAML fixture whenever the module calls open()
        fake_open = mock.mock_open(read_data=SRA_INFO_YAML_MULTIPLE_SRAS)
        with mock.patch('rsempipeline.utils.download.open', fake_open):
            vals = download.gen_orig_params_per(sample)

        # (sra file, download-complete flag file) expected for each SRA
        expected_output_pairs = (
            ('some_outdir/GSE123456/some_species/GSM1/SRX135160/SRR453140/SRR453140.sra',
             'some_outdir/GSE123456/some_species/GSM1/SRR453140.sra.download.COMPLETE'),
            ('some_outdir/GSE123456/some_species/GSM1/SRX135160/SRR453141/SRR453141.sra',
             'some_outdir/GSE123456/some_species/GSM1/SRR453141.sra.download.COMPLETE'),
            ('some_outdir/GSE123456/some_species/GSM1/SRX135160/SRR453142/SRR453142.sra',
             'some_outdir/GSE123456/some_species/GSM1/SRR453142.sra.download.COMPLETE'),
            ('some_outdir/GSE123456/some_species/GSM1/SRX135160/SRR453143/SRR453143.sra',
             'some_outdir/GSE123456/some_species/GSM1/SRR453143.sra.download.COMPLETE'),
        )
        # in the format of input, outputs, other params
        expected = [[None, list(pair), sample]
                    for pair in expected_output_pairs]
        self.assertEqual(vals, expected)
예제 #8
0
def parse(soft_file, interested_organisms):
    """Parse a soft file into a Series populated with its samples.

    One GSE per soft file is assumed. Every line is read as
    ``label = value``; only the first ``=`` separates the two, so values
    may themselves contain ``=``. Lines without any ``=`` (e.g. blank
    lines) carry no label/value pair and are skipped.

    :param soft_file: path of the soft file to parse
    :param interested_organisms: a list of interested organisms:
                                 ['Homo sapiens', 'Mus musculus']; handed
                                 to update() for every line read while a
                                 sample is current
    :returns: the parsed Series, or None if the file contains no ^SERIES
              line
    :raises ValueError: if the series named in the file doesn't match the
                        series name derived from the file name
    """
    logger.info("Parsing file: {0} ...".format(soft_file))
    series_name_from_file = get_series_name_from(soft_file)

    # Assume one GSE per soft file
    # index: the index of all passed samples, unpassed samples are not indexed
    index, series, current_sample = 1, None, None
    with open(soft_file, 'rb') as inf:
        for line in inf:
            # split on the first '=' only: the value part may contain '='
            raw_label, separator, raw_value = line.partition('=')
            if not separator:
                # not a "label = value" line, nothing to record
                continue
            label, value = raw_label.strip(), raw_value.strip()

            if label == '^SERIES':
                series = Series(value, os.path.abspath(soft_file))
                if series.name != series_name_from_file:
                    msg = ('series contained in the soft file doesn\'t match '
                           'that in the filename: {0} != {1}'.format(
                               series, series_name_from_file))
                    raise ValueError(msg)
            elif label == '^SAMPLE':
                # a new sample starts: hand the previous one to add() first
                index = add(current_sample, series, index)
                current_sample = Sample(name=value, series=series)

            if current_sample:
                current_sample = update(current_sample, label, value,
                                        interested_organisms)

    if series is None:
        # no ^SERIES line was found, so there is nothing to return
        return None

    # add the last sample
    add(current_sample, series, index)

    logger.info("{0}: {1}/{2} samples passed".format(
        series.name, series.num_passed_samples(),
        series.num_samples()))
    logger.info('=' * 30)
    return series