Example #1
 def _local_filepath(self, url):
     """ Return the local file path for url.
     """
     rest, basename = posixpath.split(url)
     dirname = posixpath.basename(rest)
     return serverfiles.localpath(
                 "ArrayExpress", os.path.join(dirname, basename))
Example #2
    def fgem_to_table(self):
        """ Retrieve the processed matrix from the Array Express FTP
        server and convert it to a :class:`Orange.data.Table`.

        """
        assert self.fgemdatafiles
        repo_dir = serverfiles.localpath("ArrayExpress", self.accession)
        # Find the file listing the data matrix files
        # (should be in sdrf but sometimes it is in 2column file only, why?)
        sdrf = self._search_files("sdrf", "txt")
        if sdrf:
            sdrf = SampleDataRelationship(
                io.TextIOWrapper(self._open(sdrf[0].get("url")),
                                 encoding="utf-8"),
            )
            if "Derived Array Data Matrix File" not in sdrf.header:
                twocol = self._search_files("twocolumn", "txt")
                if twocol:
                    sdrf = SampleDataRelationship(
                        io.TextIOWrapper(self._open(twocol[0].get("url")),
                                         encoding="utf-8")
                    )
        matrix_file = self._search_files("fgem")[0]
        self._open(matrix_file.get("url"))  # To download if not cached

        idf_file = self._search_files("idf", "txt")[0]
        self._open(idf_file.get("url"))  # To download if not cached
        return mage_tab_to_orange(os.path.join(repo_dir, idf_file.get("name")))
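A minimal usage sketch. `ArrayExpressExperiment` is assumed to be the class that defines this method (the class itself is not shown in the excerpt), and network access to the ArrayExpress FTP server is required:

    experiment = ArrayExpressExperiment("E-MEXP-31")  # assumed wrapper class
    table = experiment.fgem_to_table()
    print(len(table), "rows in the processed matrix")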
Example #3
    def _download_file(self, url, extract=True):
        """ Download the `file` from the ArrayExpress into a local
        repository directory.

        """
        rest, basename = posixpath.split(url)
        dirname = posixpath.basename(rest)
        repo_dir = serverfiles.localpath("ArrayExpress", dirname)
        try:
            os.makedirs(repo_dir)
        except OSError:
            pass
        stream = urlopen(url)
        local_filename = os.path.join(repo_dir, basename)
        with open(local_filename, "wb") as dst:
            shutil.copyfileobj(stream, dst)
        stream.close()

        if extract:
            _, extension = os.path.splitext(local_filename)
            if extension == ".zip":
                import zipfile
                zfile = zipfile.ZipFile(local_filename)
                zfile.extractall(repo_dir)
            elif extension == ".gz":
                import gzip
                gzfile = gzip.open(local_filename)
                gzfile.extractall(repo_dir)
            elif extension in [".tgz"]:
                import tarfile
                tfile = tarfile.TarFile(local_filename)
                tfile.extractall(repo_dir)
            elif extension == ".txt":
                pass
            else:
                raise ValueError("Unknown extension ('{0}').".format(basename))
Example #4
 def updateInfo(self):
     gds_info = self.gds_info
     text = ("%i datasets\n%i datasets cached\n" %
             (len(gds_info),
              len(glob.glob(serverfiles.localpath("GEO") + "/GDS*"))))
     filtered = self.treeWidget.model().rowCount()
     if len(self.gds) != filtered:
         text += ("%i after filtering") % filtered
     self.infoBox.setText(text)
Example #5
    def ParseTaxdumpFile(file=None, outputdir=None, callback=None):
        import Orange.utils
        if file is None:
            so = io.BytesIO()  # the downloaded archive is binary data
            Orange.utils.wget("ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz", dst_obj=so)
            file = tarfile.open(None, "r:gz", io.BytesIO(so.getvalue()))
            so.close()
        elif isinstance(file, str):
            file = tarfile.open(file)
        # The .dmp members are text; decode them explicitly.
        names = io.TextIOWrapper(file.extractfile("names.dmp"),
                                 encoding="utf-8").readlines()
        nodes = io.TextIOWrapper(file.extractfile("nodes.dmp"),
                                 encoding="utf-8").readlines()
        namesDict = defaultdict(list)
        for line in names:
            if not line.strip():
                continue
            line = line.rstrip("\t\n|").split("\t|\t")
            id, name, unique_name, name_class = line
            if unique_name:
                namesDict[id].append((unique_name, name_class))
            else:
                namesDict[id].append((name, name_class))

        nodesDict = {}
        for line in nodes:
            if not line.strip():
                continue
            line = line.split("\t|\t")[:3]
            id, parent, rank = line
            nodesDict[id] = (parent, rank)
        
        if outputdir is None:
            outputdir = serverfiles.localpath("Taxonomy")
        text = TextDB().create(os.path.join(outputdir, "ncbi_taxonomy.db"))
        info = TextDB().create(os.path.join(outputdir, "ncbi_taxonomy_inf.db"))
        milestones = set(range(0, len(namesDict), max(int(len(namesDict)/100), 1)))
        for i, (id, names) in enumerate(namesDict.items()):
            parent, rank = nodesDict[id]
            ## id, parent and rank go first
            entry = [id, parent, rank]
            ## all names and name class codes pairs follow ordered so scientific name is first
            names = sorted(names,
                           key=lambda x: (x[1] != "scientific name", x[1], x[0]))
            entry.extend([name for name, class_ in names])
            info_entry = [id] + [class_ for name, class_ in names]
            text(entry)
            info(info_entry)
            if callback and i in milestones:
                callback(i)
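A hedged usage sketch, assuming a taxdump archive has already been downloaded to the working directory:

    ParseTaxdumpFile(
        "taxdump.tar.gz",
        callback=lambda i: print("processed", i, "name records"))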
Example #6
    def _updateToolTip(self):
        state_str = self.STATE_STRINGS[self.item.state]
        try:
            diff_date = self.item.latest - self.item.local
        except (TypeError, AttributeError):
            # `latest` or `local` may be missing or None.
            diff_date = None

        tooltip = ("State: %s\nTags: %s" %
                   (state_str, ", ".join(tag for tag in self.item.tags
                    if not tag.startswith("#"))))

        if self.item.state in [CURRENT, OUTDATED, DEPRECATED]:
            tooltip += ("\nFile: %s" %
                        serverfiles.localpath(self.item.domain,
                                              self.item.filename))

        if self.item.state == OUTDATED and diff_date:
            tooltip += ("\nServer version: %s\nStatus: old (%d days)" % (self.item.latest, diff_date.days))
        else:
            tooltip += ("\nServer version: %s" % self.item.latest)

        for i in range(1, 4):
            self.setToolTip(i, tooltip)
Example #7
def get_gds_model(progress=lambda val: None):
    """
    Initialize and return a GDS datasets model.

    :param progress: A progress callback.
    :returns:
        A tuple of (QStandardItemModel, geo.GDSInfo, [geo.GDS]).
    :rtype: tuple

    .. note::
        The returned QStandardItemModel's thread affinity is set to
        the GUI thread.

    """
    progress(1)
    info = geo.GDSInfo()
    search_keys = ["dataset_id", "title", "platform_organism", "description"]
    cache_dir = serverfiles.localpath(geo.DOMAIN)
    gds_link = "http://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc={0}"
    pm_link = "http://www.ncbi.nlm.nih.gov/pubmed/{0}"
    gds_list = []

    def is_cached(gds):
        return os.path.exists(
            os.path.join(cache_dir, gds["dataset_id"]) + ".soft.gz")

    def item(displayvalue, item_values={}):
        item = QStandardItem()
        item.setData(displayvalue, Qt.DisplayRole)
        for role, value in item_values.items():
            item.setData(value, role)
        return item

    def gds_to_row(gds):
        #: Text for easier full-text search.
        search_text = " | ".join(
            [gds.get(key, "").lower() for key in search_keys])
        row = [
            item(" " if is_cached(gds) else "", {TextFilterRole: search_text}),
            item(gds["dataset_id"],
                 {LinkRole: gds_link.format(gds["dataset_id"])}),
            item(gds["title"]),
            item(gds["platform_organism"]),
            item(len(gds["samples"])),
            item(gds["feature_count"]),
            item(gds["gene_count"]),
            item(len(gds["subsets"])),
            item(
                gds.get("pubmed_id", ""), {
                    LinkRole:
                    pm_link.format(gds["pubmed_id"])
                    if gds.get("pubmed_id") else None
                })
        ]
        return row

    model = QStandardItemModel()
    model.setHorizontalHeaderLabels([
        "", "ID", "Title", "Organism", "Samples", "Features", "Genes",
        "Subsets", "PubMedID"
    ])
    progress(20)
    for gds in info.values():
        model.appendRow(gds_to_row(gds))
        gds_list.append(gds)

    progress(50)

    if QThread.currentThread() is not QCoreApplication.instance().thread():
        model.moveToThread(QCoreApplication.instance().thread())
    return model, info, gds_list
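A hedged usage sketch; `view` is an assumed QTreeView living in the GUI thread. Per the docstring's note, the model's thread affinity is moved to the GUI thread before returning, so this handoff is safe even when get_gds_model runs in a worker thread:

    model, gds_info, gds_list = get_gds_model(progress=print)
    view.setModel(model)  # `view`: assumed QTreeView in the GUI thread
    print(len(gds_list), "GDS datasets loaded")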
Example #8
        "Subsets", "PubMedID"
    ])
    progress(20)
    for gds in info.values():
        model.appendRow(gds_to_row(gds))

        gds_list.append(gds)

    progress(50)

    if QThread.currentThread() is not QCoreApplication.instance().thread():
        model.moveToThread(QCoreApplication.instance().thread())
    return model, info, gds_list


GDS_CACHE_DIR = serverfiles.localpath(geo.DOMAIN)

if sys.version_info >= (3, 4):
    _os_replace = os.replace
else:
    if os.name != "posix":

        def _os_replace(src, dst):
            # os.rename on Windows fails if dst already exists, so
            # remove it and retry. OSError also covers Python < 3.3,
            # where FileExistsError is not defined.
            try:
                os.rename(src, dst)
            except OSError:
                os.remove(dst)
                os.rename(src, dst)
    else:
        _os_replace = os.rename
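A sketch of the download-then-swap pattern _os_replace appears intended for: write to a temporary file first, then swap it over the cached copy. `data` and the target filename are assumptions for illustration:

    import os
    import tempfile

    fd, tmpname = tempfile.mkstemp(dir=GDS_CACHE_DIR, suffix=".tmp")
    with os.fdopen(fd, "wb") as f:
        f.write(data)  # `data`: freshly downloaded bytes (assumed)
    _os_replace(tmpname, os.path.join(GDS_CACHE_DIR, "GDS1234.soft.gz"))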
Example #9
class ArrayExpressConnection(object):
    """
    Constructs and runs REST queries against the ArrayExpress API.

    :param address: Address of the ArrayExpress API.
    :param timeout: Timeout for the connection.

    """

    DEFAULT_ADDRESS = "http://www.ebi.ac.uk/arrayexpress/{format}/v2/"
    DEFAULT_FORMAT = "json"
    DEFAULT_CACHE = serverfiles.localpath("ArrayExpress",
                                          "ArrayExpressCache.shelve")

    # Order of arguments in the query
    _ARGS_ORDER = ["keywords", "species", "array"]

    def __init__(self,
                 address=None,
                 timeout=30,
                 cache=None,
                 username=None,
                 password=None):
        self.address = address if address is not None else self.DEFAULT_ADDRESS
        self.timeout = timeout
        self.cache = cache if cache is not None else self.DEFAULT_CACHE
        self.username = username
        self.password = password

    def format_query(self, **kwargs):
        """Format the query arguments in `kwargs`.

        >>> conn.format_query(gxa=True, efcount=(1, 5))
        'efcount=[1 TO 5]&gxa=true'

        """

        # Formatters:
        def format_default(val):
            if isinstance(val, six.string_types):
                return val
            else:
                return "+".join(val)

        def format_species(val):
            return '"%s"' % val.lower()

        def format_gxa(val):
            if val:
                return "true"
            else:
                raise ValueError("gxa={0}".format(val))

        def format_expandefo(val):
            if val:
                return "on"
            else:
                raise ValueError("expandefo={0}".format(val))

        def format_true_false(val):
            return "true" if val else "false"

        def format_interval(val):
            if isinstance(val, tuple):
                return "[{0} TO {1}]".format(*val)
            else:
                raise ValueError("Must be an interval argument (min, max)!")

        def format_date(val):
            # TODO check if val contains a datetime.date object
            # assert proper format
            return format_interval(val)

        def format_wholewords(val):
            if val:
                return "on"
            else:
                raise ValueError("wholewords={0}".format(val))

        formatters = {
            "species": format_species,
            "gxa": format_gxa,
            "expandefo": format_expandefo,
            "directsub": format_true_false,
            "assaycount": format_interval,
            "efcount": format_interval,
            "samplecount": format_interval,
            "sacount": format_interval,
            "rawcount": format_interval,
            "fgemcount": format_interval,
            "miamescore": format_interval,
            "date": format_date,
            "wholewords": format_wholewords,
        }
        parts = []
        arg_items = sorted(kwargs.items())
        arg_items = sorted(arg_items,
                           key=lambda arg: self._ARGS_ORDER.index(arg[0])
                           if arg[0] in self._ARGS_ORDER else 100)

        for key, value in arg_items:
            if key == "format":
                continue  # format is handled in query_url
            if key not in ARRAYEXPRESS_FIELDS:
                raise ValueError("Invalid argument name: '{0}'".format(key))
            if value is not None and value != []:
                fmt = formatters.get(key, format_default)
                value = fmt(value)
                parts.append("{0}={1}".format(key, value))

        return "&".join(parts)

    def query_url(self, what="experiments", **kwargs):
        """Return a formatted query URL for the query arguments.

        >>> conn.query_url(accession="E-MEXP-31")
        'http://www.ebi.ac.uk/arrayexpress/json/v2/experiments?accession=E-MEXP-31'

        """
        query = self.format_query(**kwargs)
        url = posixpath.join(self.address, what)
        url = url.format(format=kwargs.get("format", self.DEFAULT_FORMAT))
        url = url + ("?" + query if query else "")
        url = url.replace(" ", "%20")
        return url

    def query_url_experiments(self, **kwargs):
        """Return a formatted experiments query URL for the query arguments.
        """
        return self.query_url("experiments", **kwargs)

    def query_url_files(self, **kwargs):
        """Return a formatted files query URL for the query arguments.
        """
        return self.query_url("files", **kwargs)

    def query_experiment(self, **kwargs):
        """Return an open stream to the experiments query results. 
           Takes the same arguments as the :obj:`query_experiments` function.
        """
        url = self.query_url_experiments(**kwargs)
        stream = self._cache_urlopen(url, timeout=self.timeout)
        return stream

    def query_files(self, **kwargs):
        """Return an open stream to the files query results.
           Takes the same arguments as the :obj:`query_files` function.
        """
        url = self.query_url_files(**kwargs)
        stream = self._cache_urlopen(url, timeout=self.timeout)
        return stream

    def open_file(self, accession, kind="raw", ext=None):
        """ Return a file handle to experiment data.
        
        :param str accession: Experiment accession (e.g. "E-TABM-1087").
        :param str kind: Experiment data type.
        :param str ext: Required file extension, or None to accept any.
        
        Possible values for the parameter `kind`:
            - raw: return the raw data if available
            - processed: return the processed data if available
            - biosamples: a png or svg design image
            - idf: investigation description
            - adf: array design description
            - mageml: MAGE-ML file

        Example::

            >>> raw_file = conn.open_file("E-TABM-1087", kind="raw")  # doctest: +SKIP
            >>> processed_file = conn.open_file("E-TABM-1087", kind="processed")  # doctest: +SKIP

        """
        stream = self.query_files(accession=accession, format="json")
        data = json.load(io.TextIOWrapper(stream, encoding="utf-8"))
        try:
            files = data["files"]["experiment"]["file"]
        except KeyError:
            raise ValueError(accession)

        for file in files:
            filekind = file["kind"]
            fileext = file["extension"]
            if (filekind == kind) and (fileext == ext or ext is None):
                url = file["url"]
                return self._cache_urlopen(str(url), timeout=self.timeout)

        raise ValueError("%s does not have a file of kind: %r" %
                         (accession, kind))

    def _cache_urlopen(self, url, timeout=30):
        if self.cache is not None:
            with self.open_cache("r") as cache:
                if url in cache:
                    return io.BytesIO(cache[url])

            stream = urlopen(url, timeout=timeout)
            data = stream.read()
            with self.open_cache("w") as cache:
                cache[url] = data

            return io.BytesIO(data)
        else:
            return urlopen(url, timeout=timeout)

    def open_cache(self, flag="r"):
        if isinstance(self.cache, six.string_types):
            try:
                return closing(_open_shelve(self.cache, flag))
            except Exception:
                return _fake_closing({})
        elif hasattr(self.cache, "close"):
            return closing(self.cache)
        elif self.cache is None:
            return _fake_closing({})
        else:
            return _fake_closing(self.cache)
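A hedged end-to-end sketch of the class above; it assumes the EBI endpoint is reachable and reuses the same json/io decoding pattern that open_file applies internally:

    import io
    import json

    conn = ArrayExpressConnection()
    print(conn.query_url(accession="E-MEXP-31"))
    stream = conn.query_experiment(accession="E-MEXP-31")
    data = json.load(io.TextIOWrapper(stream, encoding="utf-8"))
    print(sorted(data))  # top-level keys of the JSON response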