Пример #1
0
def test_massive_offline(block_internet, tmp_path):
    """Check that MassIVE projects can be set up with the internet blocked.

    Covers three cases: the defaults (``local`` is ``tmp_path / MSVID`` and
    ``fetch`` is falsy), an explicit ``local`` directory, and ``fetch=True``.
    The repository name is passed in mixed case for the first lookup.
    """
    # A bare lookup of the MassIVE identifier must succeed offline.
    ppx.find_project(MSVID)

    default_proj = ppx.find_project(MSVID, repo="mAsSiVe")
    assert isinstance(default_proj, ppx.MassiveProject)
    assert default_proj.local == tmp_path / MSVID
    assert not default_proj.fetch

    custom_dir = tmp_path / "test"
    relocated_proj = ppx.find_project(MSVID, local=custom_dir, repo="massive")
    assert relocated_proj.local == custom_dir

    fetching_proj = ppx.find_project(MSVID, fetch=True, repo="massive")
    assert fetching_proj.fetch
Пример #2
0
def test_pride_offline(block_internet, tmp_path):
    """Check that PRIDE projects can be set up with the internet blocked.

    A lookup without ``repo`` is expected to raise. With ``repo`` given,
    the test covers the defaults (``local`` is ``tmp_path / PXID`` and
    ``fetch`` is falsy), an explicit ``local`` directory, and ``fetch=True``.
    """
    # Without an explicit repository the lookup is expected to fail offline.
    with pytest.raises(Exception):
        ppx.find_project(PXID)

    default_proj = ppx.find_project(PXID, repo="PrIdE")
    assert isinstance(default_proj, ppx.PrideProject)
    assert default_proj.local == tmp_path / PXID
    assert not default_proj.fetch

    custom_dir = tmp_path / "test"
    relocated_proj = ppx.find_project(PXID, local=custom_dir, repo="pride")
    assert relocated_proj.local == custom_dir

    fetching_proj = ppx.find_project(PXID, fetch=True, repo="pride")
    assert fetching_proj.fetch
Пример #3
0
    def prepare_data(self):
        """Verify that the spectrum indices contain the expected spectra.

        If a mass spectrometry data file is missing from a spectrum index,
        this function will download the missing mzML file and add it.

        The ``"rejected"`` entry of ``self.splits`` is skipped. A file counts
        as present when ``<project>/<ms_file>`` occurs as a substring of any
        entry in the index's ``ms_files``. Downloaded files are deleted after
        indexing unless ``self.keep_files`` is truthy.
        """
        index_map = {
            "train": self.train_index,
            "validation": self.valid_index,
            "test": self.test_index,
        }

        for split, projects in self.splits.items():
            if split == "rejected":
                continue

            index = index_map[split]
            for project, ms_files in projects.items():
                # The project handle is the same for every file of a project,
                # so create it lazily and at most once instead of once per
                # missing file.
                proj = None
                for ms_file in ms_files:
                    fname = str(Path(project, ms_file))
                    if any(fname in f for f in index.ms_files):
                        continue

                    if proj is None:
                        proj = ppx.find_project(project)

                    downloaded = proj.download(ms_file)[0]
                    index.add_file(downloaded)
                    if not self.keep_files:
                        downloaded.unlink()  # Delete the file when we're done
Пример #4
0
    def _add_remote_dataset(self, accession, filter_str):
        """Register a remote dataset and its ``.raw`` files in the database.

        Looks up ``accession`` with ppx, creates the ``Dataset`` row if it is
        not already present, and appends a ``Sample`` for every remote
        ``*.raw`` file whose name matches ``filter_str`` and that is not
        already recorded for the dataset.

        Parameters
        ----------
        accession : str
            The project accession handed to ``ppx.find_project``.
        filter_str : str
            Regular expression (searched with ``re.search`` semantics); only
            matching remote file names are added.
        """
        # Initiate ppx call for accession validation.
        # os.makedirs() returns None, so the path has to be built first and
        # passed on separately; otherwise ppx would receive local=None.
        cache_path = os.path.join(os.getcwd(), ".ppx_cache")
        os.makedirs(cache_path, exist_ok=True)
        project = ppx.find_project(accession, local=cache_path)

        # Check if dataset is already in database
        dataset_query = self.database\
                            .session\
                            .query(Dataset)\
                            .filter(Dataset.accession == accession)
        if self.database.safe_run(dataset_query.count) == 0:
            print("==> {} not in database, adding now".format(accession))
            dataset = Dataset(accession=accession, title=project.title)

        else:
            print("==> {} already present in database, updating files".format(
                accession))
            dataset = self.database.safe_run(dataset_query.one)

        # Check for files in remote
        name_filter = re.compile(filter_str)
        file_list = sorted(
            f for f in project.remote_files("*.raw")
            if name_filter.search(f) is not None
        )

        # Add files if not present
        for file_name in file_list:
            stem, extension = os.path.splitext(file_name)
            sample = Sample(
                parentDataset=dataset.accession,
                sampleName=stem,
                fileType=extension.lstrip(".").lower(),
                fileName=file_name,
                fileLocation="remote")
            sample_query = self.database.session\
                                        .query(Sample)\
                                        .filter(Sample.parentDataset == sample.parentDataset)\
                                        .filter(Sample.sampleName == sample.sampleName)
            if self.database.safe_run(sample_query.count) == 0:
                dataset.samples.append(sample)

        # Update database
        self.database.safe_add(dataset)
Пример #5
0
    def add_projects(self, split, num):
        """Add projects to a split.

        Candidates are taken from the front of ``self._projects`` in order
        (presumably pre-shuffled by the caller -- confirm). A project is
        accepted when it has at least one ``ccms_peak`` mzML file with a
        non-zero ``spectra_ms2`` count and ``validate`` approves the first
        such file; otherwise it is appended to ``self.splits["rejected"]``.
        Every examined candidate is removed from ``self._projects`` and
        ``self.save()`` is called after each one.

        Parameters
        ----------
        split : str, {"train", "validation", "test"}
            The split to add projects to.
        num : int
            The number of random projects to add. If this is zero or
            negative, nothing is examined.

        Returns
        -------
        int
            The number of projects that were added, which may be fewer than
            ``num`` if the candidates run out.
        """
        added = 0
        # Counted explicitly so that an empty candidate list (or num <= 0)
        # does not leave the slice bound undefined.
        consumed = 0
        pattern = re.compile(r"ccms_peak/.*\.mzml$", flags=re.IGNORECASE)
        for msvid in self._projects:
            # Checked before touching the next candidate so that the state
            # of the final accepted project is saved too.
            if added >= num:
                break

            consumed += 1
            proj = ppx.find_project(msvid)
            rows = [line.split(",") for line in proj.file_info().splitlines()]
            ms2_idx = rows[0].index("spectra_ms2")
            keep = []
            for info in rows[1:]:
                # Drop the leading path component before matching.
                fname = info[0].split("/", 1)[1]
                if pattern.search(fname) and int(info[ms2_idx]):
                    keep.append(fname)

            if keep and validate(proj, keep[0]):
                self.splits[split][msvid] = keep
                added += 1
                LOGGER.info("Found %i/%i...", added, num)
            else:
                self.splits["rejected"].append(msvid)

            self.save()

        # Remove the projects we've sampled from consideration:
        del self._projects[:consumed]
        if added < num:
            LOGGER.warning(
                "Not enough projects for the request. Added %i", added
            )

        return added
Пример #6
0
    def test_remote_update(self):
        """Test remote connection to ProteomeXchange

        Adds one accession through ``DatasetManager`` using a database in a
        temporary directory, then checks that the number of stored samples
        equals the number of remote ``*.raw`` files ppx reports.
        """

        prev_wd = os.getcwd()
        with tempfile.TemporaryDirectory() as temp_path:
            os.chdir(temp_path)
            try:
                test_db_path = "sqlite:///" + temp_path + "/phosphopedia.db"

                print()
                accession = "PXD001492"
                manager = managers.DatasetManager(test_db_path)
                manager.add_datasets([accession])

                # Make sure all files added
                project = ppx.find_project(accession, local=temp_path)
                nfiles = len(project.remote_files("*.raw"))
                sample_query = manager.database\
                                      .session\
                                      .query(schema.Sample)\
                                      .order_by(schema.Sample.sampleName)
                sample_entries = manager.database.safe_run(sample_query.all)
                self.assertEqual(len(sample_entries), nfiles)
            finally:
                # Restore the working directory even when the test fails, so
                # the process is not left inside the directory that is about
                # to be removed.
                os.chdir(prev_wd)
Пример #7
0
def test_timeout():
    """Try a timeout value that is too small.

    The lookup is expected to raise ``ConnectTimeout`` or ``ReadTimeout``.
    """
    with pytest.raises((ConnectTimeout, ReadTimeout)):
        # The call is expected to raise, so its result is not bound.
        ppx.find_project(PXID, timeout=0.0000000000001)
Пример #8
0
def test_massive_project_with_pxd():
    """Look up MSVPXD and check it resolves to the MassIVE project MSVID."""
    resolved = ppx.find_project(MSVPXD, timeout=10)
    assert isinstance(resolved, ppx.MassiveProject)
    assert resolved.id == MSVID
Пример #9
0
def test_massive_project():
    """Check that MSVID resolves to a ``MassiveProject``."""
    resolved = ppx.find_project(MSVID, timeout=10)
    assert isinstance(resolved, ppx.MassiveProject)
Пример #10
0
def test_pride_online():
    """Check that PXID resolves to a ``PrideProject``."""
    resolved = ppx.find_project(PXID, timeout=10)
    assert isinstance(resolved, ppx.PrideProject)