Example #1
    def copy_input_data(self, input_data_path, ds_config_path):
        """ Copy imzML/ibd/config files from input path to a dataset work directory

        Args
        ----
        input_data_path : str
            Path to input files
        ds_config_path : str
            Path to the dataset config file
        """
        # if self.local_fs_only:
        #     ex = self.local_dir.exists(self.local_dir.txt_path)
        # else:
        #     ex = self.remote_dir.exists(self.remote_dir.txt_path)
        if not self.local_dir.exists(self.local_dir.imzml_path):
            logger.info('Copying data from %s to %s', input_data_path,
                        self.local_dir.ds_path)

            if input_data_path.startswith('s3a://'):
                cmd_check('mkdir -p {}', self.local_dir.ds_path)
                bucket_name, inp_path = split_s3_path(input_data_path)

                bucket = self.s3.Bucket(bucket_name)
                for obj in bucket.objects.filter(Prefix=inp_path):
                    if not obj.key.endswith('/'):
                        path = join(self.local_dir.ds_path,
                                    obj.key.split('/')[-1])
                        self.s3transfer.download_file(bucket_name, obj.key,
                                                      path)
            else:
                self.local_dir.copy(input_data_path, self.local_dir.ds_path)

        if ds_config_path:
            self.local_dir.copy(ds_config_path,
                                self.local_dir.ds_config_path,
                                is_file=True)
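The copy_input_data excerpts above and the S3 helpers below rely on a split_s3_path function that is not shown in these excerpts. A minimal sketch of what it is assumed to do with 's3a://bucket/prefix'-style paths (hypothetical implementation, not taken from the project):

    def split_s3_path(path):
        """ Assumed behaviour: 's3a://bucket/some/prefix' -> ('bucket', 'some/prefix') """
        path = path.split('://', 1)[-1]       # drop the 's3a://' or 's3://' scheme
        bucket, _, key = path.partition('/')  # first segment is the bucket name
        return bucket, key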
Example #2
    def copy_input_data(self, input_data_path, ds_config_path):
        """ Copy imzML/ibd/config files from input path to a dataset work directory

        Args
        ----
        input_data_path : str
            Path to input files
        ds_config_path : str
            Path to the dataset config file
        """
        # if self.local_fs_only:
        #     ex = self.local_dir.exists(self.local_dir.txt_path)
        # else:
        #     ex = self.remote_dir.exists(self.remote_dir.txt_path)
        if not self.local_dir.exists(self.local_dir.imzml_path):
            logger.info('Copying data from %s to %s', input_data_path, self.local_dir.ds_path)

            if input_data_path.startswith('s3a://'):
                cmd_check('mkdir -p {}', self.local_dir.ds_path)
                bucket_name, inp_path = split_s3_path(input_data_path)

                bucket = self.s3.Bucket(bucket_name)
                for obj in bucket.objects.filter(Prefix=inp_path):
                    if not obj.key.endswith('/'):
                        path = join(self.local_dir.ds_path, obj.key.split('/')[-1])
                        self.s3transfer.download_file(bucket_name, obj.key, path)
            else:
                self.local_dir.copy(input_data_path, self.local_dir.ds_path)

        self.local_dir.copy(ds_config_path, self.local_dir.ds_config_path, is_file=True)
Example #3
    def estimate_fdr(self, msm_df):
        logger.info('Estimating FDR...')

        target_fdr_df_list = []
        for ta in self.target_adducts:
            target_msm = msm_df.loc(axis=0)[:, ta]

            msm_fdr_list = []
            for i in range(self.decoy_sample_size):
                sf_da_list = map(
                    tuple, self.td_df[self.td_df.ta == ta][[
                        'sf_id', 'da'
                    ]][i::self.decoy_sample_size].values)
                decoy_msm = msm_df.loc[sf_da_list]
                msm_fdr = self._msm_fdr_map(target_msm, decoy_msm)
                msm_fdr_list.append(msm_fdr)

            msm_fdr_avg = pd.Series(pd.concat(msm_fdr_list,
                                              axis=1).median(axis=1),
                                    name='fdr')
            target_fdr = self._digitize_fdr(
                target_msm.join(msm_fdr_avg, on='msm'))
            target_fdr_df_list.append(target_fdr.drop('msm', axis=1))

        return pd.concat(target_fdr_df_list, axis=0)
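For readability, here is an illustration (not from the source) of the DataFrame shape that estimate_fdr appears to assume: msm_df indexed by (sf_id, adduct) with an 'msm' score column, so that .loc(axis=0)[:, ta] selects every row for one target adduct:

    import pandas as pd

    # Toy data only; the real msm_df comes from the metric calculation step.
    msm_df = pd.DataFrame(
        {'msm': [0.95, 0.40, 0.10]},
        index=pd.MultiIndex.from_tuples([(1, '+H'), (1, '+Na'), (2, '+H')],
                                        names=['sf_id', 'adduct']))
    target_msm = msm_df.loc(axis=0)[:, '+H']  # all rows whose adduct is '+H'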
Example #4
    def __init__(self, job_id, db_id, ds_config, db):
        self.job_id = job_id
        self.db_id = db_id
        self.ppm = ds_config['image_generation']['ppm']
        iso_gen_conf = ds_config['isotope_generation']
        charge = '{}{}'.format(iso_gen_conf['charge']['polarity'],
                               iso_gen_conf['charge']['n_charges'])

        target_sf_peaks_rs = db.select(THEOR_PEAKS_TARGET_ADD_SEL, self.db_id,
                                       iso_gen_conf['adducts'],
                                       iso_gen_conf['isocalc_sigma'],
                                       iso_gen_conf['isocalc_pts_per_mz'],
                                       charge)
        assert target_sf_peaks_rs, 'No formulas matching the criteria were found in theor_peaks! (target)'

        decoy_sf_peaks_rs = db.select(THEOR_PEAKS_DECOY_ADD_SEL, self.job_id,
                                      self.db_id,
                                      iso_gen_conf['isocalc_sigma'],
                                      iso_gen_conf['isocalc_pts_per_mz'],
                                      charge)
        assert decoy_sf_peaks_rs, 'No formulas matching the criteria were found in theor_peaks! (decoy)'

        sf_peak_rs = target_sf_peaks_rs + decoy_sf_peaks_rs
        self.sf_ids, self.adducts, self.sf_theor_peaks, self.sf_theor_peak_ints = zip(
            *sf_peak_rs)
        self.check_formula_uniqueness(self.sf_ids, self.adducts)

        logger.info('Loaded %s sum formulas from the DB', len(self.sf_ids))
Example #5
 def search(self):
     logger.info('Running molecule search')
     sf_images = compute_sf_images(self.sc, self.ds, self.formulas.get_sf_peak_df(),
                                   self.ds_config['image_generation']['ppm'])
     all_sf_metrics_df = self.calc_metrics(sf_images)
     sf_metrics_fdr_df = self.estimate_fdr(all_sf_metrics_df)
     sf_metrics_fdr_df = self.filter_sf_metrics(sf_metrics_fdr_df)
     return sf_metrics_fdr_df, self.filter_sf_images(sf_images, sf_metrics_fdr_df)
Example #6
 def store_sf_img_metrics(self):
     """ Store formula image metrics in the database """
     logger.info('Storing iso image metrics')
     rows = list(
         self._metrics_table_row_gen(self.job_id, self.sf_db_id,
                                     self.sf_metrics_df,
                                     self.sf_adduct_peaksn, self.metrics))
     self.db.insert(METRICS_INS, rows)
Example #7
    def index_ds(self, db, ds_name, db_name):
        annotations = db.select(RESULTS_TABLE_SQL, ds_name, db_name)

        logger.info('Deleting documents from the index: {}-{}'.format(ds_name, db_name))
        self._delete(annotations)

        logger.info('Indexing documents: {}-{}'.format(ds_name, db_name))
        self._index(annotations)
Example #8
 def clean(self):
     try:
         bucket_obj = self.s3.Bucket(self.bucket)
         for obj in bucket_obj.objects.filter(Prefix=self.ds_path):
             self.s3.Object(self.bucket, obj.key).delete()
         logger.info('Successfully deleted interim data')
     except CalledProcessError as e:
         logger.warning('Deleting interim data files error: %s', e.message)
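Note that CalledProcessError comes from the subprocess module and is not raised by boto3 calls. A hedged sketch of the same cleanup using boto3's batch delete and its own exception type (stand-alone example, not from the project):

    import boto3
    import botocore.exceptions

    def clean_prefix(bucket_name, prefix):
        bucket = boto3.resource('s3').Bucket(bucket_name)
        try:
            # Batch-delete every object under the prefix
            bucket.objects.filter(Prefix=prefix).delete()
        except botocore.exceptions.ClientError as e:
            print('Deleting interim data files error: %s' % e)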
Example #9
 def search(self):
     logger.info('Running molecule search')
     sf_images = compute_sf_images(
         self.sc, self.ds, self.formulas.get_sf_peak_df(),
         self.ds_config['image_generation']['ppm'])
     all_sf_metrics_df = self.calc_metrics(sf_images)
     sf_metrics_fdr_df = self.estimate_fdr(all_sf_metrics_df)
     sf_metrics_fdr_df = self.filter_sf_metrics(sf_metrics_fdr_df)
     return sf_metrics_fdr_df, self.filter_sf_images(
         sf_images, sf_metrics_fdr_df)
Example #10
    def _valid_sf_adduct(cls, sf, adduct):
        if sf is None or adduct is None or sf == 'None' or adduct == 'None':
            logger.warning('Invalid sum formula or adduct: sf=%s, adduct=%s', sf, adduct)
            return False

        if '-' in adduct and adduct.strip('-') not in cls._sf_elements(sf):
            logger.info('No negative adduct element in the sum formula: sf=%s, adduct=%s', sf, adduct)
            return False

        return True
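_sf_elements is not included in these excerpts. One plausible standalone sketch of the behaviour the adduct check assumes (element symbols extracted from a sum formula string; hypothetical helper):

    import re

    def sf_elements(sf):
        # 'C6H12O6' -> ['C', 'H', 'O']; 'C44H84NO8P' -> ['C', 'H', 'N', 'O', 'P']
        return re.findall(r'[A-Z][a-z]?', sf)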
Example #11
 def exists(self, path):
     try:
         self.s3.Object(*split_s3_path(path)).load()
     except botocore.exceptions.ClientError as e:
         if e.response['Error']['Code'] == "404":
             return False
         else:
             raise e
     else:
         logger.info('Path s3://%s/%s already exists', self.bucket, path)
         return True
Example #12
    def _import_theor_peaks_to_db(self, peak_lines):
        logger.info('Saving new peaks to the DB')
        if not exists(self.theor_peaks_tmp_dir):
            makedirs(self.theor_peaks_tmp_dir)

        peak_lines_path = join(self.theor_peaks_tmp_dir, 'peak_lines.csv')
        with open(peak_lines_path, 'w') as f:
            f.write('\n'.join(peak_lines))

        with open(peak_lines_path) as peaks_file:
            self.db.copy(peaks_file, 'theor_peaks')
Example #13
    def store_job_meta(self):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        self.ds_id = int(self.db.select_one(DS_ID_SEL, self.ds_name)[0])
        self.job_id = self.ds_id
        self.db.alter(DEL_JOB_SQL, self.job_id)
        rows = [(self.job_id, self.sf_db_id, self.ds_id,
                 datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self.db.insert(JOB_INS, rows)

        rows = [(self.job_id, adduct)
                for adduct in self.ds_config['isotope_generation']['adducts']]
        self.db.insert(ADDUCT_INS, rows)
Example #14
 def get_spectra(self):
     """
     Returns
     -------
     : pyspark.rdd.RDD
         Spark RDD with spectra. One spectrum per RDD entry.
     """
     txt_to_spectrum = self.txt_to_spectrum_non_cum
     # if self.sm_config['fs']['local']:
     logger.info('Converting txt to spectrum rdd from %s',
                 self.wd_manager.txt_path)
     return self.sc.textFile(self.wd_manager.txt_path,
                             minPartitions=8).map(txt_to_spectrum)
Example #15
    def _valid_sf_adduct(cls, sf, adduct):
        if sf is None or adduct is None or sf == 'None' or adduct == 'None':
            logger.warning('Invalid sum formula or adduct: sf=%s, adduct=%s',
                           sf, adduct)
            return False

        if '-' in adduct and adduct.strip('-') not in cls._sf_elements(sf):
            logger.info(
                'No negative adduct element in the sum formula: sf=%s, adduct=%s',
                sf, adduct)
            return False

        return True
Example #16
    def apply_database_filters(self, formula_list):
        """ Filters according to settings in dataset config

        Args
        ----
        formula_list : list
            List of pairs (id, sum formula) to search through

        Returns
        -------
        : list
            Filtered list of pairs (id, sum formula)
        """
        if 'organic' in map(lambda s: s.lower(), self.ds_config['database'].get('filters', [])):
            logger.info('Organic sum formula filter has been applied')
            return filter(lambda (_, sf): 'C' in self._sf_elements(sf), formula_list)
        return formula_list
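The lambda (_, sf): ... syntax is Python 2 only (tuple parameters were removed in Python 3). A sketch of an equivalent Python 3 filter, assuming formula_list holds (id, sum formula) pairs as documented:

    def organic_only(formula_list, sf_elements):
        # sf_elements is a stand-in for the class's _sf_elements helper
        return [(sf_id, sf) for sf_id, sf in formula_list if 'C' in sf_elements(sf)]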
Example #17
    def run(self):
        """ Starts peaks generation. Checks all formula peaks saved in the database and
        generates peaks only for new ones"""
        logger.info('Running theoretical peaks generation')

        db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]
        formula_list = self.apply_database_filters(self.db.select(AGG_FORMULA_SEL, db_id))

        stored_sf_adduct = self.db.select(SF_ADDUCT_SEL, db_id,
                                          self.isocalc_wrapper.sigma,
                                          self.isocalc_wrapper.charge,
                                          self.isocalc_wrapper.pts_per_mz)

        sf_adduct_cand = self.find_sf_adduct_cand(formula_list, set(stored_sf_adduct))
        logger.info('%d saved (sf, adduct)s, %d not saved (sf, adduct)s', len(stored_sf_adduct), len(sf_adduct_cand))

        if sf_adduct_cand:
            self.generate_theor_peaks(sf_adduct_cand)
Example #18
    def apply_database_filters(self, formula_list):
        """ Filters according to settings in dataset config

        Args
        ----
        formula_list : list
            List of pairs (id, sum formula) to search through

        Returns
        -------
        : list
            Filtered list of pairs (id, sum formula)
        """
        if 'organic' in map(lambda s: s.lower(),
                            self.ds_config['database'].get('filters', [])):
            logger.info('Organic sum formula filter has been applied')
            return filter(lambda (_, sf): 'C' in self._sf_elements(sf),
                          formula_list)
        return formula_list
Example #19
File: fdr.py Project: frulo/SM_distributed
    def estimate_fdr(self, msm_df):
        logger.info('Estimating FDR...')

        target_fdr_df_list = []
        for ta in self.target_adducts:
            target_msm = msm_df.loc(axis=0)[:,ta]

            msm_fdr_list = []
            for i in range(self.decoy_sample_size):
                sf_da_list = map(tuple, self.td_df[self.td_df.ta == ta][['sf_id', 'da']][i::self.decoy_sample_size].values)
                decoy_msm = msm_df.loc[sf_da_list]
                msm_fdr = self._msm_fdr_map(target_msm, decoy_msm)
                msm_fdr_list.append(msm_fdr)

            msm_fdr_avg = pd.Series(pd.concat(msm_fdr_list, axis=1).median(axis=1), name='fdr')
            target_fdr = self._digitize_fdr(target_msm.join(msm_fdr_avg, on='msm'))
            target_fdr_df_list.append(target_fdr.drop('msm', axis=1))

        return pd.concat(target_fdr_df_list, axis=0)
Example #20
 def create_index(self, name='sm'):
     body = {
         'settings': {
             "index": {
                 'max_result_window': 2147483647,
                 "analysis": {
                     "analyzer": {
                         "analyzer_keyword": {
                             "tokenizer": "keyword",
                             "filter": "lowercase"
                         }
                     }
                 }
             }
         },
         'mappings': {
             "annotation": {
                 "properties": {
                     "db_name": {"type": "string", "index": "not_analyzed"},
                     "ds_name": {"type": "string", "index": "not_analyzed"},
                     "sf": {"type": "string", "index": "not_analyzed"},
                     "comp_names": {
                         "type": "string",
                         "analyzer": "analyzer_keyword",
                     },
                     "comp_ids": {"type": "string", "index": "not_analyzed"},
                     "chaos": {"type": "float", "index": "not_analyzed"},
                     "image_corr": {"type": "float", "index": "not_analyzed"},
                     "pattern_match": {"type": "float", "index": "not_analyzed"},
                     "msm": {"type": "float", "index": "not_analyzed"},
                     "adduct": {"type": "string", "index": "not_analyzed"},
                     "fdr": {"type": "float", "index": "not_analyzed"},
                     "mz": {"type": "string", "index": "not_analyzed"}
                 }
             }
         }
     }
     if not self.ind_client.exists(name):
         out = self.ind_client.create(index=name, body=body)
         logger.info('Index {} created\n{}'.format(name, out))
     else:
         logger.info('Index {} already exists'.format(name))
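The mapping above uses the legacy Elasticsearch 'string' type with 'index': 'not_analyzed'. On Elasticsearch 5+ the same intent would be expressed with the 'keyword' and 'text' field types; a sketch of a few equivalent field definitions (not taken from the project):

    modern_properties = {
        'db_name': {'type': 'keyword'},          # exact-value field, not analyzed
        'sf': {'type': 'keyword'},
        'comp_names': {'type': 'text',           # analyzed with the custom analyzer
                       'analyzer': 'analyzer_keyword'},
        'msm': {'type': 'float'},
        'fdr': {'type': 'float'},
    }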
Example #21
    def __init__(self, job_id, db_id, ds_config, db):
        self.job_id = job_id
        self.db_id = db_id
        self.ppm = ds_config['image_generation']['ppm']
        iso_gen_conf = ds_config['isotope_generation']
        charge = '{}{}'.format(iso_gen_conf['charge']['polarity'], iso_gen_conf['charge']['n_charges'])
        target_sf_peaks_rs = db.select(THEOR_PEAKS_TARGET_ADD_SEL, self.db_id,
                                       iso_gen_conf['adducts'], iso_gen_conf['isocalc_sigma'],
                                       iso_gen_conf['isocalc_pts_per_mz'], charge)
        assert target_sf_peaks_rs, 'No formulas matching the criteria were found in theor_peaks! (target)'

        decoy_sf_peaks_rs = db.select(THEOR_PEAKS_DECOY_ADD_SEL, self.job_id, self.db_id,
                                      iso_gen_conf['isocalc_sigma'], iso_gen_conf['isocalc_pts_per_mz'], charge)
        assert decoy_sf_peaks_rs, 'No formulas matching the criteria were found in theor_peaks! (decoy)'

        sf_peak_rs = target_sf_peaks_rs + decoy_sf_peaks_rs
        self.sf_df = (pd.DataFrame(sf_peak_rs, columns=['sf_id', 'adduct', 'centr_mzs', 'centr_ints'])
                      .sort_values(['sf_id', 'adduct']))
        self.check_formula_uniqueness(self.sf_df)

        logger.info('Loaded %s sum formula, adduct combinations from the DB', self.sf_df.shape[0])
Example #22
    def __init__(self, job_id, db_id, ds_config, db):
        self.job_id = job_id
        self.db_id = db_id
        self.ppm = ds_config['image_generation']['ppm']
        iso_gen_conf = ds_config['isotope_generation']
        charge = '{}{}'.format(iso_gen_conf['charge']['polarity'], iso_gen_conf['charge']['n_charges'])

        target_sf_peaks_rs = db.select(THEOR_PEAKS_TARGET_ADD_SEL, self.db_id,
                                       iso_gen_conf['adducts'], iso_gen_conf['isocalc_sigma'],
                                       iso_gen_conf['isocalc_pts_per_mz'], charge)
        assert target_sf_peaks_rs, 'No formulas matching the criteria were found in theor_peaks! (target)'

        decoy_sf_peaks_rs = db.select(THEOR_PEAKS_DECOY_ADD_SEL, self.job_id, self.db_id,
                                      iso_gen_conf['isocalc_sigma'], iso_gen_conf['isocalc_pts_per_mz'], charge)
        assert decoy_sf_peaks_rs, 'No formulas matching the criteria were found in theor_peaks! (decoy)'

        sf_peak_rs = target_sf_peaks_rs + decoy_sf_peaks_rs
        self.sf_ids, self.adducts, self.sf_theor_peaks, self.sf_theor_peak_ints = zip(*sf_peak_rs)
        self.check_formula_uniqueness(self.sf_ids, self.adducts)

        logger.info('Loaded %s sum formulas from the DB', len(self.sf_ids))
Example #23
    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self.sm_config['spark'].iteritems():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self.sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key",
                      self.sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key",
                      self.sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl",
                      "org.apache.hadoop.fs.s3a.S3AFileSystem")

        # sconf.set("spark.python.profile", "true")
        self.sc = SparkContext(master=self.sm_config['spark']['master'],
                               conf=sconf,
                               appName='SM engine')
        if not self.sm_config['spark']['master'].startswith('local'):
            self.sc.addPyFile(join(local_path(proj_root()), 'sm.zip'))
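dict.iteritems() exists only in Python 2. A minimal Python 3 sketch of the same property loop, with spark_conf standing in for self.sm_config['spark']:

    from pyspark import SparkConf

    def build_spark_conf(spark_conf):
        sconf = SparkConf()
        for prop, value in spark_conf.items():   # .iteritems() -> .items() in Python 3
            if prop.startswith('spark.'):
                sconf.set(prop, value)
        return sconf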
Example #24
    def generate_theor_peaks(self, sf_adduct_cand):
        """
        Args
        ----
        sf_adduct_cand : list
            List of (formula id, formula, adduct) triples which don't have theoretical patterns saved in the database

        Returns
        -------
        : list
            List of strings with formatted theoretical peaks data
        """
        logger.info('Generating missing peaks')
        formatted_iso_peaks = self.isocalc_wrapper.formatted_iso_peaks
        db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]
        n = 10000
        for i in xrange(0, len(sf_adduct_cand), n):
            sf_adduct_cand_rdd = self.sc.parallelize(sf_adduct_cand[i:i+n], numSlices=128)
            peak_lines = (sf_adduct_cand_rdd
                          .flatMap(lambda (sf_id, sf, adduct): formatted_iso_peaks(db_id, sf_id, sf, adduct))
                          .collect())
            self._import_theor_peaks_to_db(peak_lines)
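The flatMap callback above uses a Python 2 tuple-parameter lambda, which is invalid syntax in Python 3. A sketch of the equivalent step with explicit unpacking; format_peaks stands in for isocalc_wrapper.formatted_iso_peaks:

    def expand_candidates(candidates_rdd, db_id, format_peaks):
        # each element of candidates_rdd is an (sf_id, sf, adduct) triple
        return (candidates_rdd
                .flatMap(lambda cand: format_peaks(db_id, *cand))
                .collect())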
Example #25
    def run(self):
        """ Starts peaks generation. Checks all formula peaks saved in the database and
        generates peaks only for new ones"""
        logger.info('Running theoretical peaks generation')

        db_id = self.db.select_one(DB_ID_SEL,
                                   self.ds_config['database']['name'])[0]
        formula_list = self.apply_database_filters(
            self.db.select(AGG_FORMULA_SEL, db_id))

        stored_sf_adduct = self.db.select(SF_ADDUCT_SEL, db_id,
                                          self.isocalc_wrapper.sigma,
                                          self.isocalc_wrapper.charge,
                                          self.isocalc_wrapper.pts_per_mz)

        sf_adduct_cand = self.find_sf_adduct_cand(formula_list,
                                                  set(stored_sf_adduct))
        logger.info('%d saved (sf, adduct)s, %d not saved (sf, adduct)s',
                    len(stored_sf_adduct), len(sf_adduct_cand))

        if sf_adduct_cand:
            self.generate_theor_peaks(sf_adduct_cand)
Example #26
    def save_ds_meta(self):
        """ Save dataset metadata (name, path, image bounds, coordinates) to the database """
        # ds_id_row = self.db.select_one(DS_ID_SELECT, self.name)
        # if not ds_id_row:
        #     logger.info('No dataset with name %s found', self.name)

        # ds_id = self.db.select_one(MAX_DS_ID_SELECT)[0] + 1
        self.db.alter(DS_DEL, self.name)
        img_bounds = json.dumps({
            'x': {
                'min': self.min_x,
                'max': self.max_x
            },
            'y': {
                'min': self.min_y,
                'max': self.max_y
            }
        })
        ds_config_json = json.dumps(self.ds_config)

        owner_rs = self.db.select_one(CLIENT_ID_SEL, self.owner_email)
        if self.owner_email and not owner_rs:
            raise Exception("Could't find a user with email {}".format(
                self.owner_email))

        owner_id = owner_rs[0] if owner_rs else None
        ds_row = [(self.name, owner_id, self.input_path, img_bounds,
                   ds_config_json)]
        self.db.insert(DS_INSERT, ds_row)

        ds_id = self.db.select(DS_ID_SELECT, self.name)[0]
        logger.info("Inserted into the dataset table: %s, %s", ds_id,
                    self.name)

        xs, ys = map(list, zip(*self.coords))
        self.db.insert(COORD_INSERT, [(ds_id, xs, ys)])
        logger.info("Inserted to the coordinates table")
Example #27
    def generate_theor_peaks(self, sf_adduct_cand):
        """
        Args
        ----
        sf_adduct_cand : list
            List of (formula id, formula, adduct) triples which don't have theoretical patterns saved in the database

        Returns
        -------
        : list
            List of strings with formatted theoretical peaks data
        """
        logger.info('Generating missing peaks')
        formatted_iso_peaks = self.isocalc_wrapper.formatted_iso_peaks
        db_id = self.db.select_one(DB_ID_SEL,
                                   self.ds_config['database']['name'])[0]
        n = 10000
        for i in xrange(0, len(sf_adduct_cand), n):
            sf_adduct_cand_rdd = self.sc.parallelize(sf_adduct_cand[i:i + n],
                                                     numSlices=128)
            peak_lines = (sf_adduct_cand_rdd.flatMap(
                lambda (sf_id, sf, adduct): formatted_iso_peaks(
                    db_id, sf_id, sf, adduct)).collect())
            self._import_theor_peaks_to_db(peak_lines)
Example #28
    def convert(self, preprocess=False, print_progress=True):
        """
        Converts MS imaging data provided by the given parser to a text-based
        format. Optionally writes the coordinates into a coordinate file.

        Args
        ----
        preprocess : bool
            Apply filter and centroid detection to all spectra before writing (rarely useful)
        print_progress : bool
            Whether or not to print progress information to stdout
        """
        logger.info("ImzML -> Txt conversion...")
        self.preprocess = preprocess

        if not exists(self.txt_path):
            self.txt_file = open(self.txt_path, 'w')
            self.coord_file = open(self.coord_path, 'w') if self.coord_path else None

            self.parser = ImzMLParser(self.imzml_path)

            n_pixels = len(self.parser.coordinates)
            track_progress = get_track_progress(n_pixels, max(n_pixels / 100, 100), print_progress)

            for i, coord in enumerate(self.parser.coordinates):
                x, y = coord[:2]
                self._uniq_coord(x, y)
                self.parse_save_spectrum(i, x, y)
                track_progress(i)

            self.txt_file.close()
            if self.coord_file:
                self.coord_file.close()

            logger.info("Conversion finished successfully")
        else:
            logger.info('File %s already exists', self.txt_path)
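A Python 3 note on the progress step above: '/' is float division there, so a port would use integer division to keep the step an int, for example:

    n_pixels = 12345                      # illustrative value
    step = max(n_pixels // 100, 100)      # report progress every `step` spectra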
Example #29
 def exists(self, path):
     if exists(split_local_path(path)):
         logger.info('Path %s already exists', path)
         return True
     else:
         return False
Example #30
    def run(self, input_path, ds_config_path, clean=False):
        """ Entry point of the engine. Molecule search is completed in several steps:
         * Copying input data to the engine work dir
         * Conversion input data (imzML+ibd) to plain text format. One line - one spectrum data
         * Generation and saving to the database theoretical peaks for all formulas from the molecule database
         * Molecules search. The most compute intensive part. Spark is used to run it in distributed manner.
         * Saving results (isotope images and their metrics of quality for each putative molecule) to the database

        Args
        -------
        input_path : string
            Path to the dataset folder with .imzML and .ibd files
        ds_config_path: string
            Path to the dataset config file
        clean : bool
            Clean all interim data files before starting molecule search
        """
        try:
            self.wd_manager = WorkDirManager(self.ds_name)
            if clean:
                self.wd_manager.clean()

            self.wd_manager.copy_input_data(input_path, ds_config_path)

            self._read_ds_config()
            logger.info('Dataset config:\n%s', pformat(self.ds_config))

            self._configure_spark()
            self._init_db()

            if not self.wd_manager.exists(self.wd_manager.txt_path):
                imzml_converter = ImzmlTxtConverter(
                    self.ds_name, self.wd_manager.local_dir.imzml_path,
                    self.wd_manager.local_dir.txt_path,
                    self.wd_manager.local_dir.coord_path)
                imzml_converter.convert()

                if not self.wd_manager.local_fs_only:
                    self.wd_manager.upload_to_remote()

            self.ds = Dataset(self.sc, self.ds_name, self.client_email,
                              input_path, self.ds_config, self.wd_manager,
                              self.db)
            self.ds.save_ds_meta()

            self.store_job_meta()

            theor_peaks_gen = TheorPeaksGenerator(self.sc, self.sm_config,
                                                  self.ds_config)
            theor_peaks_gen.run()

            target_adducts = self.ds_config['isotope_generation']['adducts']
            self.fdr = FDR(self.job_id,
                           self.sf_db_id,
                           decoy_sample_size=20,
                           target_adducts=target_adducts,
                           db=self.db)
            self.fdr.decoy_adduct_selection()
            self.formulas = FormulasSegm(self.job_id, self.sf_db_id,
                                         self.ds_config, self.db)

            # search_alg = MSMBasicSearch(self.sc, self.ds, self.formulas, self.fdr, self.ds_config)
            search_alg = MSMExtraFeats(self.sc, self.ds, self.formulas,
                                       self.fdr, self.ds_config)
            sf_metrics_df, sf_iso_images = search_alg.search()

            search_results = SearchResults(
                self.sf_db_id, self.ds_id, self.job_id, self.ds_name,
                self.formulas.get_sf_adduct_peaksn(), self.db, self.sm_config,
                self.ds_config)
            search_results.sf_metrics_df = sf_metrics_df
            search_results.sf_iso_images = sf_iso_images
            search_results.metrics = search_alg.metrics
            search_results.nrows, search_results.ncols = self.ds.get_dims()
            search_results.store()

            es = ESExporter(self.sm_config)
            es.index_ds(self.db, self.ds_name,
                        self.ds_config['database']['name'])

        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            logger.error('\n'.join(
                traceback.format_exception(exc_type, exc_value,
                                           exc_traceback)))
        finally:
            if self.sc:
                # self.sc.show_profiles()
                self.sc.stop()
            if self.db:
                self.db.close()
Example #31
 def copy(self, local, remote):
     logger.info('Copying DS text files to S3...')
     self.s3transfer.upload_file(local, *split_s3_path(remote))
Example #32
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Exporting search results into a csv file')
    parser.add_argument('ds_name', type=str, help='Dataset name')
    parser.add_argument('csv_path', type=str, help='Path for the csv file')
    parser.add_argument('--config',
                        dest='sm_config_path',
                        type=str,
                        help='SM config path')
    parser.set_defaults(
        sm_config_path=path.join(proj_root(), 'conf/config.json'))
    args = parser.parse_args()

    SMConfig.set_path(args.sm_config_path)
    db = DB(SMConfig.get_conf()['db'])

    ds_config = db.select_one(DS_CONFIG_SEL, args.ds_name)[0]
    isotope_gen_config = ds_config['isotope_generation']
    charge = '{}{}'.format(isotope_gen_config['charge']['polarity'],
                           isotope_gen_config['charge']['n_charges'])
    export_rs = db.select(EXPORT_SEL, ds_config['database']['name'],
                          args.ds_name, isotope_gen_config['isocalc_sigma'],
                          charge, isotope_gen_config['isocalc_pts_per_mz'])

    header = '\t'.join(['formula_db', 'db_ids', 'sf_name', 'sf', 'adduct']) +'\t' + '\t'.join(metrics) + '\t' + \
             '\t'.join(['fdr', 'isocalc_sigma', 'isocalc_charge', 'isocalc_pts_per_mz', 'first_peak_mz']) + '\n'
    with open(args.csv_path, 'w') as f:
        f.write(header)
        f.writelines(['\t'.join(map(str, row)) + '\n' for row in export_rs])
    logger.info('Exported all search results for "%s" dataset into "%s" file',
                args.ds_name, args.csv_path)
Example #33
                        type=str,
                        help='SM config path')
    parser.set_defaults(
        sm_config_path=path.join(proj_root(), 'conf/config.json'))
    args = parser.parse_args()

    SMConfig.set_path(args.sm_config_path)
    db = DB(SMConfig.get_conf()['db'])

    ds_config, img_bounds = db.select_one(DS_CONFIG_SEL, args.ds_name)
    nrows, ncols = get_img_dims(img_bounds)
    isotope_gen_config = ds_config['isotope_generation']
    charge = '{}{}'.format(isotope_gen_config['charge']['polarity'],
                           isotope_gen_config['charge']['n_charges'])
    export_rs = db.select(EXPORT_SEL, args.ds_name, args.sf)

    export_df = pd.DataFrame(
        export_rs, columns=['sf', 'adduct', 'peak', 'pxl_inds', 'ints'])
    export_df['img_dims'] = [(img_bounds['y']['min'], img_bounds['y']['max'],
                              img_bounds['x']['min'], img_bounds['x']['max'])
                             ] * len(export_df)
    # export_df['img'] = export_df.apply(lambda r: build_matrix(np.array(r['pxl_inds']),
    #                                                           np.array(r['ints']), nrows, ncols), axis=1)
    # export_df.drop(['pxl_inds', 'ints'], axis=1, inplace=True)
    # export_df.to_csv(args.csv_path, index=False)
    # cPickle.dump(export_df, open(args.pkl_path, 'wb'))
    export_df.to_csv(args.pkl_path, index=False)
    logger.info(
        'Exported all images for "%s" sum formula in "%s" dataset into "%s" file',
        args.sf, args.ds_name, args.pkl_path)
Example #34
 def _init_db(self):
     logger.info('Connecting to the DB')
     self.db = DB(self.sm_config['db'])
     self.sf_db_id = self.db.select_one(
         DB_ID_SEL, self.ds_config['database']['name'])[0]
Example #35

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='SM process dataset at a remote spark location.')
    parser.add_argument('ds_name', type=str, help='Dataset name')
    parser.add_argument('input_path', type=str, help='Path to a dataset location')
    parser.add_argument('ds_config_path', type=str, help='Path to a dataset config file')
    parser.add_argument('--config', dest='sm_config_path', type=str, help='SM config path')
    parser.add_argument('--no-clean', dest='no_clean', action='store_true', help='do not clean interim files')

    start = time.time()
    args = parser.parse_args()

    SMConfig.set_path(args.sm_config_path)

    fileHandler = FileHandler(filename='logs/{}.log'.format(args.ds_name.replace('/', '_')))
    fileHandler.setLevel(DEBUG)
    fileHandler.setFormatter(Formatter(sm_log_formatters['SM']['format']))
    logger.addHandler(fileHandler)

    logger.debug('Using SM config:\n%s', pformat(SMConfig.get_conf()))

    logger.info("Processing...")

    job = SearchJob(None, args.ds_name)
    job.run(args.input_path, args.ds_config_path, clean=not args.no_clean)

    logger.info("All done!")
    time_spent = time.time() - start
    logger.info('Time spent: %d mins %d secs', *divmod(int(round(time_spent)), 60))
Example #36
 def store(self):
     logger.info('Storing search results to the DB')
     self.clear_old_results()
     self.store_sf_img_metrics()
     self.store_sf_iso_images()
Example #37
class SearchResults(object):
    """ Container for molecule search results

    Args
    ----------
    sf_db_id : int
        Formula database id
    ds_id : int
        Dataset id
    job_id : int
        Search job id
    sf_metrics_df : pandas.Dataframe
    sf_iso_images : pyspark.RDD
        Result images in the format ((formula_id, adduct), list of images)
    sf_adduct_peaksn : list
        List of triples (formula id, adduct, number of theoretical peaks)
    db: engine.db.DB
    sm_config: dict
    """
    def __init__(self, sf_db_id, ds_id, job_id, ds_name, sf_adduct_peaksn, db,
                 sm_config, ds_config):
        self.sf_db_id = sf_db_id
        self.ds_id = ds_id
        self.job_id = job_id
        self.ds_name = ds_name
        self.db = db
        self.sm_config = sm_config
        self.ds_config = ds_config
        self.sf_adduct_peaksn = sf_adduct_peaksn
        self.sf_iso_images = None
        self.sf_metrics_df = None
        self.metrics = None
        self.ncols = None
        self.nrows = None

    def clear_old_results(self):
        """ Clear all previous search results for the dataset from the database """
        logger.info('Clearing old job results')
        self.db.alter(clear_iso_image_sql, self.job_id)
        self.db.alter(clear_iso_image_metrics_sql, self.job_id)

    @staticmethod
    def _metrics_table_row_gen(job_id, db_id, metr_df, sf_adduct_peaksn,
                               metrics):
        for ind, r in metr_df.reset_index().iterrows():
            metr_json = json.dumps(
                OrderedDict([(m, float(r[m])) for m in metrics]))
            peaks_n = sf_adduct_peaksn[ind][2]
            yield (job_id, db_id, r.sf_id, r.adduct, float(r.msm),
                   float(r.fdr), metr_json, peaks_n)

    def store_sf_img_metrics(self):
        """ Store formula image metrics in the database """
        logger.info('Storing iso image metrics')
        rows = list(
            self._metrics_table_row_gen(self.job_id, self.sf_db_id,
                                        self.sf_metrics_df,
                                        self.sf_adduct_peaksn, self.metrics))
        self.db.insert(METRICS_INS, rows)

    def store_sf_iso_images(self):
        """ Store formula images in the database

        Uses the instance attributes nrows (number of rows in the dataset image)
        and ncols (number of columns in the dataset image).
        """
        job_id = self.job_id
        sf_db_id = self.sf_db_id
        db_config = self.sm_config['db']
        nrows = self.nrows
        ncols = self.ncols

        def iso_img_row_gen(((sf_id, adduct), img_list)):
            for peak_i, img_sparse in enumerate(img_list):
                img_ints = np.zeros(
                    int(nrows) * int(ncols)
                ) if img_sparse is None else img_sparse.toarray().flatten()
                pixel_inds = np.arange(img_ints.shape[0])
                img_ints_mask = img_ints > 0.001
                if img_ints_mask.sum() > 0:
                    yield (job_id, sf_db_id, sf_id, adduct, peak_i,
                           pixel_inds[img_ints_mask].tolist(),
                           img_ints[img_ints_mask].tolist(), img_ints.min(),
                           img_ints.max())

        def store_iso_img_rows(row_it):
            db = DB(db_config)
            try:
                rows = list(row_it)
                if rows:
                    db.insert(SF_ISO_IMGS_INS, rows)
            finally:
                db.close()

        logger.info('Storing iso images')

        # self.sf_iso_images.flatMap(iso_img_row_gen).coalesce(32).foreachPartition(store_iso_img_rows)
        self.sf_iso_images.flatMap(iso_img_row_gen).foreachPartition(
            store_iso_img_rows)
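iso_img_row_gen above uses Python 2 nested tuple parameters, which are a syntax error in Python 3. A sketch of the Python 3 form with explicit unpacking (body otherwise as in the excerpt):

    def iso_img_row_gen(item):
        (sf_id, adduct), img_list = item
        for peak_i, img_sparse in enumerate(img_list):
            ...  # same body as in the excerpt above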
Example #38
 def delete_index(self, name='sm'):
     out = self.ind_client.delete(name)
     logger.info('Index {} deleted\n{}'.format(name, out))
Example #39
              "JOIN job j ON j.id = m.job_id "
              "JOIN dataset ds ON ds.id = j.ds_id "
              "JOIN theor_peaks tp ON tp.db_id = sf_db.id AND tp.sf_id = m.sf_id AND tp.adduct = m.adduct "
              "WHERE sf_db.name = %s AND ds.name = %s "
              "AND ROUND(sigma::numeric, 6) = %s AND charge = %s AND pts_per_mz = %s")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Exporting search results into a csv file')
    parser.add_argument('ds_name', type=str, help='Dataset name')
    parser.add_argument('csv_path', type=str, help='Path for the csv file')
    parser.add_argument('--config', dest='sm_config_path', type=str, help='SM config path')
    parser.set_defaults(sm_config_path=path.join(proj_root(), 'conf/config.json'))
    args = parser.parse_args()

    SMConfig.set_path(args.sm_config_path)
    db = DB(SMConfig.get_conf()['db'])

    ds_config = db.select_one(DS_CONFIG_SEL, args.ds_name)[0]
    isotope_gen_config = ds_config['isotope_generation']
    charge = '{}{}'.format(isotope_gen_config['charge']['polarity'], isotope_gen_config['charge']['n_charges'])
    export_rs = db.select(EXPORT_SEL, ds_config['database']['name'], args.ds_name,
                          isotope_gen_config['isocalc_sigma'], charge, isotope_gen_config['isocalc_pts_per_mz'])

    header = ','.join(['formula_db', 'ds_name', 'sf', 'adduct', 'chaos', 'img_corr', 'pat_match',
                       'isocalc_sigma', 'isocalc_charge', 'isocalc_pts_per_mz', 'first_peak_mz']) + '\n'
    with open(args.csv_path, 'w') as f:
        f.write(header)
        f.writelines([','.join(map(str, row)) + '\n' for row in export_rs])
    logger.info('Exported all search results for "%s" dataset into "%s" file', args.ds_name, args.csv_path)
Example #40
 def clear_old_results(self):
     """ Clear all previous search results for the dataset from the database """
     logger.info('Clearing old job results')
     self.db.alter(clear_iso_image_sql, self.job_id)
     self.db.alter(clear_iso_image_metrics_sql, self.job_id)
Example #41
    "SELECT f.sf, t.adduct, t.centr_mzs, t.centr_ints "
    "FROM public.agg_formula f, public.theor_peaks t "
    "WHERE t.sf_id = f.id AND f.db_id = 1 AND f.sf = %s AND t.adduct = %s "  # hardcoded to always fetch from HMDB, lazy i know
    "ORDER BY t.adduct;")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Exporting isotopic images')
    parser.add_argument('sf', type=str, help='sum formula')
    parser.add_argument('add', type=str, help='adduct')
    parser.add_argument('pkl_path', type=str, help='Path for the cPickle file')
    parser.add_argument('--config',
                        dest='sm_config_path',
                        type=str,
                        help='SM config path')
    parser.set_defaults(
        sm_config_path=path.join(proj_root(), 'conf/config.json'))
    args = parser.parse_args()

    SMConfig.set_path(args.sm_config_path)
    db = DB(SMConfig.get_conf()['db'])

    export_rs = db.select(EXPORT_SEL, args.sf, args.add)

    export_df = pd.DataFrame(
        export_rs, columns=['sf', 'adduct', 'centr_mzs', 'centr_ints'])

    export_df.to_csv(args.pkl_path, index=False)
    logger.info(
        'Exported the spectra for the "%s" sum formula, "%s" adduct into "%s" file',
        args.sf, args.add, args.pkl_path)
Example #42
                        dest='sm_config_path',
                        type=str,
                        help='SM config path')
    parser.add_argument('--no-clean',
                        dest='no_clean',
                        action='store_true',
                        help='do not clean interim files')

    start = time.time()
    args = parser.parse_args()

    SMConfig.set_path(args.sm_config_path)

    fileHandler = FileHandler(
        filename='logs/{}.log'.format(args.ds_name.replace('/', '_')))
    fileHandler.setLevel(DEBUG)
    fileHandler.setFormatter(Formatter(sm_log_formatters['SM']['format']))
    logger.addHandler(fileHandler)

    logger.debug('Using SM config:\n%s', pformat(SMConfig.get_conf()))

    logger.info("Processing...")

    job = SearchJob(None, args.ds_name)
    job.run(args.input_path, args.ds_config_path, clean=not args.no_clean)

    logger.info("All done!")
    time_spent = time.time() - start
    logger.info('Time spent: %d mins %d secs',
                *divmod(int(round(time_spent)), 60))