Example #1
import json
import os
from pathlib import Path

# import path assumed for the stasis-client package
from stasis_client.client import StasisClient


def stasis():
    client = StasisClient(os.getenv('STASIS_URL'),
                          os.getenv('STASIS_API_TOKEN'))

    def mock_sample_as_json(sample_name):
        """
        mocking this method out to provide standardized access to the json data
        in a versioned scope
        """
        parent = Path(__file__).resolve().parent

        print(f'mocking call by loading from file directly: {sample_name}')
        try:
            if 'mzml.json' not in sample_name:
                sample_name = f'{sample_name}.mzml.json'
            with open(f'{parent}/data/{sample_name}', 'r') as f:
                return json.load(f)
        except FileNotFoundError:
            print(f'File {parent}/data/{sample_name} not found')
            return None

    client.sample_result_as_json = mock_sample_as_json
    assert "https://test-api.metabolomics.us/stasis" == client.get_url()
    print(f"raw:  {client.get_raw_bucket()}")
    print(f"json: {client.get_processed_bucket()}")
    print(f"zip:  {client.get_aggregated_bucket()}")
    return client
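
A hedged usage sketch: if the function above is registered as a pytest fixture (the @pytest.fixture decorator is not shown in this excerpt), a test could consume it as below; 'some_sample' is a made-up sample name.

def test_mocked_sample_result(stasis):
    # 'some_sample' is hypothetical and resolves to ./data/some_sample.mzml.json;
    # the mock returns None when that file is missing
    result = stasis.sample_result_as_json('some_sample')
    assert result is None or isinstance(result, dict)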
Example #2
    def __init__(self, job_store: JobStore):
        super().__init__(job_store)
        self.stasis_cli = StasisClient()

        self.stasis_states = self.stasis_cli.get_states()

        # static mapping to the stasis states
        self.state_map = {
            SampleState.PROCESSED: self.stasis_states['exported'],
            SampleState.FAILED: self.stasis_states['failed'],
            SampleState.SCHEDULED: self.stasis_states['entered']
        }
Example #3
class StasisDataLoader(DataLoader):
    def __init__(self, url: str = None, token: str = None, client: StasisClient = None):
        self.url = url
        self.token = token
        # honor an injected client instead of silently ignoring it
        self.client = client if client is not None else StasisClient(url, token)

    def _do_load(self, name: str) -> dict:
        sample = self.client.sample_result_as_json(name)
        return sample

    def exists(self, name: str) -> bool:
        # a sample exists once its most recent tracked state is 'exported'
        return self.client.sample_state(name)[-1]['value'] == 'exported'
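
A brief usage sketch, assuming the STASIS_URL / STASIS_API_TOKEN environment variables from Example #8 are set; 'my_sample' is a hypothetical sample name.

import os

loader = StasisDataLoader(url=os.getenv('STASIS_URL'),
                          token=os.getenv('STASIS_API_TOKEN'))
if loader.exists('my_sample'):           # latest tracked state is 'exported'
    data = loader._do_load('my_sample')  # 'my_sample' is hypothetical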
Example #4
    def __init__(self,
                 args: dict,
                 stasis: Optional[StasisClient] = None,
                 disable_progress_bar=False):
        if isinstance(args, Namespace):
            args = vars(args)

        self.args = args

        if stasis:
            self.stasis_cli = stasis
        else:
            self.stasis_cli = StasisClient()

        self.bucket_used = self.stasis_cli.get_processed_bucket()
        self.disable_progress_bar = disable_progress_bar
Example #5
import os

# import path assumed for the stasis-client package
from stasis_client.client import StasisClient


def create_stasis_instance(config):
    env = config['env']  # required: fail fast with a KeyError if it is missing

    if env == 'prod':
        url = 'https://api.metabolomics.us/stasis'
    else:
        url = f'https://{env.lower()}-api.metabolomics.us/stasis'

    key_name = f'{env.upper()}_STASIS_API_TOKEN'
    key = os.environ[key_name].strip()

    print(f'Stasis api address: {url}')
    # print only the variable name, not the secret itself
    print(f'Stasis api key from: {key_name}')

    return StasisClient(url, key)
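
A minimal sketch of how the factory resolves environments; the 'test' stage is an assumption and TEST_STASIS_API_TOKEN must be set for it to work.

client = create_stasis_instance({'env': 'test'})  # 'test' stage is assumed
# -> url resolves to https://test-api.metabolomics.us/stasis
# -> key is read from the TEST_STASIS_API_TOKEN environment variable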
Example #6
class JobSampleStateService(SampleStateService):
    """
    utilizes stasis to keep track of sample states of the scheduled jobs.
    """

    def __init__(self, job_store: JobStore):
        super().__init__(job_store)
        self.stasis_cli = StasisClient()

        self.stasis_states = self.stasis_cli.get_states()

        # static mapping to the stasis states
        self.state_map = {
            SampleState.PROCESSED: self.stasis_states['exported'],
            SampleState.FAILED: self.stasis_states['failed'],
            SampleState.SCHEDULED: self.stasis_states['entered']
        }

    def set_state(self, id: str, sample_name: str, state: SampleState):
        pass

    def _state_sample(self, id: str, sample_name: str) -> SampleState:
        pass
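
Because state_map is a plain dict, translating a stasis state string back into a SampleState only needs an inverted mapping; a hedged sketch, assuming a job_store instance exists in scope:

service = JobSampleStateService(job_store)  # job_store: assumed to exist
reverse_map = {v: k for k, v in service.state_map.items()}
assert reverse_map[service.stasis_states['failed']] is SampleState.FAILED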
Example #7
    def __init__(self, url: str = None, token: str = None, client: StasisClient = None):
        self.url = url
        self.token = token
        self.client = client if client is not None else StasisClient(url, token)
Example #8
import os

# import path assumed for the stasis-client package
from stasis_client.client import StasisClient


def stasis_cli():
    return StasisClient(os.getenv('STASIS_URL'), os.getenv('STASIS_API_TOKEN'))
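
A usage sketch, assuming STASIS_URL and STASIS_API_TOKEN are exported; the fallback URL below mirrors the test address asserted in Example #1.

os.environ.setdefault('STASIS_URL', 'https://test-api.metabolomics.us/stasis')
client = stasis_cli()
print(client.get_url())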
Example #9
    parser.add_argument('--upload',
                        help='uploads results to S3',
                        # note: store_true combined with default=True means
                        # this flag can never actually disable uploading
                        action='store_true',
                        default=True)

    parser.add_argument('--mz-tolerance',
                        help='m/z alignment tolerance',
                        type=float,
                        default=0.01)
    parser.add_argument('--rt-tolerance',
                        help='retention time alignment tolerance',
                        type=float,
                        default=0.1)

    parser.add_argument("-x",
                        "--unknowns",
                        help="include unconfirmed targets",
                        required=False,
                        action="store_true",
                        default=False)
    return parser


if __name__ == '__main__':
    parser = create_parser()
    args = parser.parse_args()
    stasis = StasisClient(url=os.getenv('STASIS_URL', None),
                          token=os.getenv('STASIS_TOKEN', None))
    JobAggregator(vars(args), stasis=stasis).aggregate_job(job=args.job,
                                                           upload=args.upload)
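
A hedged programmatic sketch; it assumes the truncated part of create_parser() defines the '--job' argument that args.job refers to above, and '12345' is a made-up job id.

args = create_parser().parse_args(['--job', '12345'])  # '12345' is made up
print(args.upload, args.mz_tolerance, args.rt_tolerance, args.unknowns)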
Example #10
class Aggregator:
    def __init__(self,
                 args: dict,
                 stasis: Optional[StasisClient] = None,
                 disable_progress_bar=False):
        if isinstance(args, Namespace):
            args = vars(args)

        self.args = args

        if stasis:
            self.stasis_cli = stasis
        else:
            self.stasis_cli = StasisClient()

        self.bucket_used = self.stasis_cli.get_processed_bucket()
        self.disable_progress_bar = disable_progress_bar

    def find_intensity(self, value) -> int:
        """
        Returns the rounded intensity of an annotation; replaced values are
        zeroed out when 'exclude_replacement' is set
        Args:
            value: annotation dict with 'replaced' and 'intensity' keys

        Returns:

        """
        if not value['replaced'] or not self.args.get('exclude_replacement', False):
            return round(value['intensity'])
        else:
            return 0

    @staticmethod
    def find_replaced(value) -> int:
        """
        Returns the intensity value only for replaced data
        Args:
            value:

        Returns:

        """
        if value['replaced']:
            return round(value['intensity'])
        else:
            return 0

    @staticmethod
    def add_metadata(samples: List[str], data: List):
        """
        Creates the column headers with sample metadata
        Args:
            samples: list of sample names
            data: list of sample result dicts

        Returns: a dataframe with metadata on top of sample results

        """
        dicdata = {
            'Id': None,
            'Label': None,
            'Splash': None,
            'Target RI(s)': None,
            'Target mz': None,
            'Target Type': None,
            'found %': None,
            'InChIKey': ['species', 'organ', 'batch', 'sample_type', 'time']
        }
        for idx, sample in enumerate(samples, start=1):
            filtered_sample = list(
                filter(lambda x: x.get('sample', None) == sample, data))

            if len(filtered_sample) > 0:
                if '_qc' in sample.lower():
                    sample_type = 'qc'
                elif '_nist' in sample.lower():
                    sample_type = 'nist'
                else:
                    sample_type = 'sample'

                # could be also done over [0] since we should never have duplicated samples anyway
                for x in filtered_sample:
                    species = x.get('metadata', {}).get('species', '')
                    organ = x.get('metadata', {}).get('organ', '')
                    dicdata[sample] = [species, organ, '', sample_type, idx]
            else:
                # missing sample
                dicdata[sample] = ['', '', '', '', idx]
        return pd.DataFrame(dicdata)

    def export_excel(self, data, type, infile, sort_index=False):
        """
        Saves a dataframe to excel format
        Args:
            data: the dataframe to export
            type: the name of the excel sheet
            infile: filename of the result to use in excel filename
            sort_index: sort the data on reindexing, True | False

        Returns:
        """

        # saving excel file
        # print(f'{time.strftime("%H:%M:%S")} - Exporting excel file {type}')
        file, ext = os.path.splitext(infile)

        # Build suffix
        if self.args.get('test', False):
            suffix = 'testResults'
        else:
            suffix = 'results'

        if self.args.get('exclude_replacement', False):
            suffix += '-norepl'
        else:
            suffix += '-repl'

        separator = '' if file.endswith("/") else '/'
        output_name = f'{file}{separator}{type.lower().replace(" ", "_")}-{suffix}.xlsx'

        if type == 'Correction curve':
            data.dropna(inplace=True)
            data.reset_index(drop=True, inplace=True)
            # print(data)

        if sort_index:
            reindexed = data.set_index('Id').sort_index()
        else:
            reindexed = data.set_index('Id')

        with pd.ExcelWriter(output_name) as writer:
            reindexed.fillna('').to_excel(writer, sheet_name=type)

        print(f'Saved file {output_name}')

    @staticmethod
    def calculate_average(intensity, mass, rt, origrt, biorecs):
        """
        UNUSED
        Calculates the average intensity, mass and retention index of biorecs

        Args:
            intensity:
            mass:
            rt:
            origrt:
            biorecs:

        Returns:

        """
        # print(f'{time.strftime("%H:%M:%S")} - Calculating average of biorecs for intensity, '
        #       f'mass and RT (ignoring missing results)')
        np.seterr(invalid='log')

        for i in range(len(intensity)):
            intensity.loc[i, AVG_BR_] = intensity.loc[i, biorecs].mean()
            mass.loc[i, AVG_BR_] = mass.loc[i, biorecs].mean()
            rt.loc[i, AVG_BR_] = rt.loc[i, biorecs].mean()
            origrt.loc[i, AVG_BR_] = 'NA'

    @staticmethod
    def calculate_rsd(intensity, mass, rt, origrt, biorecs):
        """
        UNUSED
        Calculates intensity, mass and retention index Relative Standard Deviation of biorecs
        Args:
            intensity:
            mass:
            rt:
            origrt:
            biorecs:

        Returns:

        """
        # print(f'{time.strftime("%H:%M:%S")} - Calculating %RDS of biorecs for intensity, '
        #       f'mass and RT (ignoring missing results)')
        size = range(len(intensity))
        np.seterr(invalid='log')

        for i in size:
            try:
                intensity.loc[
                    i, RSD_BR_] = (intensity.loc[i, biorecs].std() /
                                   intensity.loc[i, biorecs].mean()) * 100
            except Exception:
                print(
                    f'{time.strftime("%H:%M:%S")} - Can\'t calculate % RSD for target {intensity.loc[i, "name"]}.'
                    f' Sum of intensities = {intensity.loc[i, biorecs].sum()}')

            mass.loc[i, RSD_BR_] = (mass.loc[i, biorecs].std() /
                                    mass.loc[i, biorecs].mean()) * 100
            rt.loc[i, RSD_BR_] = (rt.loc[i, biorecs].std() /
                                  rt.loc[i, biorecs].mean()) * 100
            origrt.loc[i, RSD_BR_] = 'NA'

    def format_sample(self, sample):
        """
        Filters the incoming sample data separating intensity, mass, retention index and replacement values

        Args:
            sample: sample result file

        Returns:

        """
        intensities = {}
        masses = {}
        rts = {}
        origrts = {}
        curve = {}
        replaced = {}
        msms = {}

        def debug(value, expected_type):
            if not isinstance(value, expected_type):
                raise Exception(f'invalid type: {value} - {expected_type}')
            return value

        for k, v in sample['injections'].items():
            # accumulate per-injection values instead of overwriting the
            # previous injection on every iteration
            intensities[k] = [
                self.find_intensity(r['annotation']) for r in v['results']
            ]
            masses[k] = [
                round(r['annotation']['mass'], 4) for r in v['results']
            ]
            rts[k] = [
                round(debug(r['annotation']['retentionIndex'], float), 2)
                for r in v['results']
            ]
            origrts[k] = [
                round(r['annotation']['nonCorrectedRt'], 2)
                for r in v['results']
            ]
            replaced[k] = [
                self.find_replaced(r['annotation']) for r in v['results']
            ]
            curve[k] = v['correction']['curve']
            msms[k] = [r['annotation'].get('msms', '') for r in v['results']]

        return [None, intensities, masses, rts, origrts, curve, replaced, msms]
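
    # The positional layout returned above is consumed in process_sample_list:
    # index 1 = intensities, 2 = masses, 3 = RIs, 4 = original RTs,
    # index 5 = correction curve, 6 = replaced values, 7 = MSMS spectra.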

    @staticmethod
    def build_worksheet(targets, upb, label=' working...'):
        """
        Structures the data to be 'worksheet' ready
        Args:
            targets: list of targets
            upb: disable the progress bar, True or False
            label: progress bar message

        Returns: a dataframe formatted with the first columns of a final report

        """
        rows = []
        pattern = re.compile('.*?_[A-Z]{14}-[A-Z]{10}-[A-Z]')

        i = 1
        bar = tqdm.tqdm(targets, desc=label, unit=' targets', disable=upb)
        for x in bar:
            try:
                rows.append({
                    'Id': i,
                    'Label': x['name'],
                    'Splash': x.get('splash', ''),
                    'Target RI(s)': x['retentionTimeInSeconds'],
                    'Target mz': x['mass'],
                    'Target Type': x['targetType'],
                    'InChIKey': x['name'].split('_')[-1]
                    if pattern.match(x['name']) else None
                })
            except TypeError as e:
                bar.write(f'Error adding {x} to the result set. {e.args}')
            finally:
                i += 1

        df = pd.DataFrame(rows)  # .set_index('ID')

        return df[TARGET_COLUMNS]

    @staticmethod
    def build_target_identifier(target):
        return f"{target['name']}_{target['retentionTimeInSeconds'] / 60:.2f}_{target['mass']:.4f}"

    def process_sample_list(self, samples, destination):
        """
        Runs the aggregation pipeline on the list of samples
        Args:
            samples: list of sample names
            destination: folder where aggregated results are written

        Returns:
        """
        # use subset of samples for testing
        if self.args.get('test', False):
            samples = samples[:5]

        # creating target list
        results = []

        os.makedirs(f'{destination}/json', exist_ok=True)

        dir = self.args.get('dir', '/tmp')

        if not os.path.exists(dir):
            print("sorry your specified path didn't exist, we can't continue!")
            raise FileNotFoundError(dir)

        print(f'looking for local data in directory: {dir}')
        # print(f'using bucket {self.stasis_cli.get_processed_bucket()} for remote downloads')

        sbar = tqdm.tqdm(samples,
                         desc='Getting results',
                         unit=' samples',
                         disable=self.disable_progress_bar)
        for sample in sbar:
            sbar.set_description(sample)
            if sample in ['samples']:
                continue

            result_file = f'{sample}'
            saved_result = f'{dir}/{result_file}.mzml.json'

            sbar.write(f'looking for {result_file}')
            sbar.write(f'looking for {saved_result}')
            if self.args.get('save') or not os.path.exists(saved_result):
                sbar.write(
                    f'downloading result data from stasis for {sample}, '
                    f'because {saved_result} does not exist locally')
                try:
                    resdata = self.stasis_cli.sample_result_as_json(
                        result_file)
                    sbar.write("\t\t=> successfully downloaded data file")
                except Exception as e:

                    try:
                        print(
                            f'we observed an error during downloading the data file: {str(e)}, using backup approach'
                        )

                        retry = 10
                        while retry > 0:
                            try:
                                resdata = self.stasis_cli.sample_result_as_json(
                                    result_file, f"{result_file}.mzml.json")
                                sbar.write(
                                    "\t\t=> successfully downloaded data file -> using explicit file handle"
                                )
                                break
                            except Exception as ex:
                                retry -= 1
                                if retry == 0:
                                    raise ex
                                time.sleep(1)
                    except Exception as exe:
                        print(
                            f'we observed an error during downloading the data file: {str(result_file)}. '
                            f'Exception type was {str(exe)}')
                        resdata = None
            else:
                sbar.write(f'loading existing result data from {saved_result}')
                with open(saved_result, 'rb') as data:
                    resdata = json.load(data)
                    sbar.write("\t\t=> successfully loaded existing data file")

            if resdata is None:
                sbar.write(
                    f'Failed getting {sample}. We looked in bucket {self.bucket_used}'
                )
            elif resdata == '':
                sbar.write(
                    f'the result received for {sample} was empty. This is not acceptable!!! Designated local file is {result_file} located at {dir}'
                )
            elif resdata and resdata.get('Error') is None:
                results.append(resdata)
                with bz2.BZ2File(f'{destination}/json/{sample}.mzml.json.bz2',
                                 'w',
                                 compresslevel=9) as outfile:
                    d = json.dumps(resdata, indent=4)
                    outfile.write(d.encode())
            else:
                raise Exception('this should not have happened!')

        if len(results) == 0:
            print(
                'we did not manage to discover any of the calculation data for this job!'
            )
            raise NoSamplesFoundException(
                'sorry none of your samples were found!')
        targets = self.get_target_list(results)

        # creating spreadsheets
        intensity = self.build_worksheet(targets,
                                         upb=self.disable_progress_bar,
                                         label='intensity matrix')
        if self.args.get('extra_files', False):
            mass = self.build_worksheet(targets,
                                        upb=self.disable_progress_bar,
                                        label='mass matrix')
            rt = self.build_worksheet(targets,
                                      upb=self.disable_progress_bar,
                                      label='RI matrix')
            origrt = self.build_worksheet(targets,
                                          upb=self.disable_progress_bar,
                                          label='RT matrix')
            curve = self.build_worksheet(targets,
                                         upb=self.disable_progress_bar,
                                         label='curve data')
            replaced = self.build_worksheet(targets,
                                            upb=self.disable_progress_bar,
                                            label='replacement matrix')
            msms = self.build_worksheet(targets,
                                        upb=self.disable_progress_bar,
                                        label='MSMS Spectra')

        # populating spreadsheets
        for data in tqdm.tqdm(results,
                              desc='Formatting results',
                              unit=' samples',
                              disable=self.disable_progress_bar):
            sample = data['sample']

            if 'error' not in data:
                formatted = self.format_sample(data)

                intensity[sample] = pd.DataFrame(formatted[1])
                if self.args.get('extra_files', False):
                    mass[sample] = pd.DataFrame(formatted[2])
                    rt[sample] = pd.DataFrame(formatted[3])
                    origrt[sample] = pd.DataFrame(formatted[4])
                    curve[sample] = pd.DataFrame(formatted[5])
                    replaced[sample] = pd.DataFrame(formatted[6])
                    msms[sample] = pd.DataFrame(formatted[7])
            else:
                sbar.write(f'Error in data for {sample}')
                intensity[sample] = np.nan
                if self.args.get('extra_files', False):
                    mass[sample] = np.nan
                    rt[sample] = np.nan
                    origrt[sample] = np.nan
                    curve[sample] = np.nan
                    replaced[sample] = np.nan
                    msms[sample] = np.nan

        if not self.args.get('keep_msms') and self.args.get(
                'extra_files', False):
            self.filter_msms(msms, intensity)

        # biorecs = [br for br in intensity.columns if 'biorec' in str(br).lower() or 'qc' in str(br).lower()]
        pd.set_option('display.max_rows', 100)
        pd.set_option('display.max_columns', 15)
        pd.set_option('display.width', 1000)

        try:
            discovery = intensity[
                intensity.columns[len(TARGET_COLUMNS):]].apply(
                    lambda row: row.dropna()[row > 0].count() / len(row.dropna(
                    )),
                    axis=1)
            intensity.insert(loc=len(TARGET_COLUMNS) + 1,
                             column='found %',
                             value=discovery)
        except Exception as e:
            print(f'Error in discovery calculation: {str(e.args)}')

        md = self.add_metadata(samples, results)
        intensity = pd.concat([md, intensity],
                              sort=False).reset_index(drop=True)

        sheet_names['intensity'].append(intensity)
        if self.args.get('extra_files', False):
            sheet_names['mass'].append(mass)
            sheet_names['ri'].append(origrt)
            sheet_names['rt'].append(rt)
            sheet_names['repl'].append(replaced)
            sheet_names['curve'].append(curve)
            sheet_names['msms'].append(msms)

        print(f'\nSaving results to {destination}')

        if self.args.get('extra_files', False):
            for t in sheet_names.values():
                print(t)
                try:
                    self.export_excel(t[1], t[0], destination)
                except Exception as exerr:
                    print(f'Error creating excel file for {t}')
                    print(str(exerr))
        else:
            try:
                self.export_excel(sheet_names['intensity'][1],
                                  sheet_names['intensity'][0], destination)
            except Exception as exerr:
                print(str(exerr))

    def filter_msms(self, msms, intensity):

        indices = intensity.iloc[:, len(TARGET_COLUMNS):].idxmax(axis=1)

        reduced_msms = msms.apply(lambda x: x[indices[x['Id'] - 1]], axis=1)
        msms.drop(msms.columns[len(TARGET_COLUMNS):], axis=1, inplace=True)
        msms['MSMS Spectrum'] = reduced_msms
        return msms
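
    # In other words: for each target row keep only the spectrum coming from
    # the sample with the highest intensity, dropping all per-sample columns.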

    def get_target_list(self, results):
        """
        Returns the list of targets from a result file
        Args:
            results: result data with target info

        Returns: list of targets
        """

        first_injection = next(iter(results[0]['injections'].values()))
        targets = [x['target'] for x in first_injection['results']]

        if not self.args.get('unknowns'):
            print('\nOnly saving confirmed targets\n')
            targets = list(
                filter(lambda x: x['targetType'] != 'UNCONFIRMED', targets))

        print(f'Found {len(targets)} targets.')
        return targets

    def aggregate(self):
        """
        Collects information on the experiment and decides if aggregation of the full experiment is possible

        Returns: the filename of the aggregated (excel) file
        """

        if 'infiles' not in self.args:
            raise KeyError(
                "sorry you need to specify at least one input file for this function"
            )

        for sample_file in self.args['infiles']:
            if not os.path.isfile(sample_file):
                raise FileNotFoundError(
                    f'file name {sample_file} does not exist')

            suffix = os.path.splitext(os.path.split(sample_file)[-1])[0]
            dest = f"{self.args.get('dir', '.')}/{suffix}"

            with open(sample_file) as processed_samples:
                samples = [
                    p.split(',')[0]
                    for p in processed_samples.read().strip().splitlines() if p
                ]
                self.aggregate_samples(samples, dest)

    def aggregate_samples(self, samples: List[str], destination: str):
        """
        Aggregates the samples at the specified destination
        Args:
            samples: list of sample names to aggregate
            destination: folder to save reports on
        Returns:
        """
        if not os.path.exists(destination):
            print(f'Creating destination folder: {destination}')
            os.makedirs(destination, exist_ok=True)

        if samples[0].startswith('samples'):
            samples = samples[1:]

        self.process_sample_list(samples, destination)
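
A hedged end-to-end sketch of driving the class above; the file name, the 'dir' argument and the infiles layout (one sample name per line, optionally comma-separated) are assumptions based on how aggregate() parses its input.

stasis = StasisClient(os.getenv('STASIS_URL'), os.getenv('STASIS_API_TOKEN'))
aggregator = Aggregator({'infiles': ['samples.txt'],  # hypothetical file
                         'dir': '/tmp'},
                        stasis=stasis)
aggregator.aggregate()  # writes excel reports under /tmp/samples/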
Example #11
def stasis_client(stasis_vars):
    return StasisClient(**stasis_vars)
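
Usage sketch: stasis_vars is presumably a companion fixture yielding constructor kwargs; it is shown here as a plain dict for illustration.

import os

stasis_vars = {'url': os.getenv('STASIS_URL'),          # illustrative kwargs,
               'token': os.getenv('STASIS_API_TOKEN')}  # not the real fixture
client = stasis_client(stasis_vars)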