Example #1
def addIGVSamples(fields, results_samp, annot_samples=None):
    """creates phenotype file for IGV
    :param results_samp: Solr results for samples to be included
    :type results_samp: Array.
    :param annot_samples: annotation files included with the Solr results
    :type annot_samples: Array
    """
    # creates human readable indexes of fields to iterate over
    fields_dict = {}
    for i in fields:
        find_index = i.find("_Characteristics_")
        if find_index > -1:
            new_key = i.split("_Characteristics_")[0]
            fields_dict[i] = new_key

    # Creating temp file to enter into file_store
    tempsampname = tempfile.NamedTemporaryFile(delete=False)

    # writing header to sample file
    tempsampname.write("#sampleTable" + "\n")

    # writing column names to sample file
    col_names = "Linking_id"
    for k, v in fields_dict.iteritems():
        col_names = col_names + '\t' + v
    tempsampname.write(col_names + "\n")

    # iterating over sample files
    pheno_results = get_sample_lines(fields_dict, results_samp)
    tempsampname.write(pheno_results)

    # if annotations are not null
    if annot_samples:
        pheno_annot = get_sample_lines(fields_dict, annot_samples)
        tempsampname.write(pheno_annot)

    # closing temp file
    tempsampname.close()

    # getting file_store_uuid
    filestore_uuid = create(tempsampname.name, permanent=True, filetype="txt")
    filestore_item = import_file(filestore_uuid, permanent=True, refresh=True)

    # file to rename
    temp_file = filestore_item.datafile.name.split('/')
    temp_file = temp_file[len(temp_file) - 1] + '.txt'

    # rename file by way of file_store
    filestore_item = rename(filestore_uuid, temp_file)

    # getting file information based on file_uuids
    curr_fs = FileStoreItem.objects.get(uuid=filestore_uuid)

    # full path to selected UUID File
    curr_url = get_full_url(curr_fs.get_datafile_url())

    # delete temp file
    os.unlink(tempsampname.name)

    return curr_url
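A minimal usage sketch for the helper above. The import path, the Solr field names, and the result shape are assumptions for illustration only; the call signature follows the code shown here.

from visualization_manager.views import addIGVSamples  # assumed module path

solr_fields = ["organism_Characteristics_generic_s", "uuid"]
solr_samples = [
    {"organism_Characteristics_generic_s": "Mouse", "uuid": "1234-abcd"},  # placeholder doc
]

# Returns the full URL of the generated "#sampleTable" phenotype file, which
# can then be attached to an IGV session as the "Sample Information" resource.
pheno_file_url = addIGVSamples(solr_fields, solr_samples)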
Example #2
def copy_file(orig_uuid):
    """Helper function that copies a file if given the original file's UUID
    :param orig_uuid: UUID of file to copy.
    :type orig_uuid: str.
    :returns: UUID of newly copied file.
    """
    orig_fsi = read(orig_uuid)
    newfile_uuid = None
    try:
        newfile_uuid = create(orig_fsi.source, orig_fsi.sharename, orig_fsi.filetype, permanent=is_permanent(orig_uuid))
        import_file(newfile_uuid, refresh=True)
    except AttributeError:
        pass

    return newfile_uuid
Example #3
def copy_file(orig_uuid):
    """Helper function that copies a file if given the original file's UUID
    :param orig_uuid: UUID of file to copy.
    :type orig_uuid: str.
    :returns: UUID of newly copied file.
    """
    orig_fsi = read(orig_uuid)
    newfile_uuid = None
    try:
        newfile_uuid = create(
            orig_fsi.source, orig_fsi.sharename, orig_fsi.filetype,
            permanent=is_permanent(orig_uuid)
        )
        import_file(newfile_uuid, refresh=True)
    except AttributeError:
        pass

    return newfile_uuid
Example #4
def copy_file(orig_uuid):
    """Helper function that copies a file if given the original file's UUID
    :param orig_uuid: UUID of file to copy.
    :type orig_uuid: str.
    :returns: UUID of newly copied file.
    """
    newfile_uuid = None
    try:
        orig_fsi = FileStoreItem.objects.get(uuid=orig_uuid)
    except (FileStoreItem.DoesNotExist,
            FileStoreItem.MultipleObjectsReturned) as e:
        logger.error("Couldn't properly fetch FileStoreItem: %s", e)
    else:
        try:
            newfile_uuid = create(orig_fsi.source,
                                  orig_fsi.sharename,
                                  orig_fsi.filetype,
                                  permanent=is_permanent(orig_uuid))
            import_file(newfile_uuid, refresh=True)
        except AttributeError:
            pass

    return newfile_uuid
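A minimal usage sketch for copy_file; the import path and the UUID value are placeholders. A None return indicates the original FileStoreItem could not be fetched or copied, mirroring the function's error handling.

from file_store.models import copy_file  # assumed module path

original_uuid = "00000000-0000-0000-0000-000000000000"  # placeholder UUID
new_uuid = copy_file(original_uuid)
if new_uuid is None:
    # the original item was missing, duplicated, or lacked a source attribute
    pass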
Example #5
def _get_galaxy_download_tasks(analysis):
    """Get file import tasks for Galaxy analysis results"""
    logger.debug("Preparing to download analysis results from Galaxy")
    task_list = []

    # retrieving list of files to download for workflow
    dl_files = analysis.workflow_dl_files
    # create a dictionary of files to download, as predetermined by the
    # workflow's keep operators
    dl_dict = {}
    for dl in dl_files.all():
        temp_dict = {}
        temp_dict['filename'] = dl.filename
        temp_dict['pair_id'] = dl.pair_id
        dl_dict[str(dl.step_id)] = temp_dict
    galaxy_instance = analysis.workflow.workflow_engine.instance

    try:
        download_list = galaxy_instance.get_history_file_list(
            analysis.history_id)
    except galaxy.client.ConnectionError as exc:
        error_msg = (
            "Error downloading Galaxy history files for analysis '%s': %s")
        logger.error(error_msg, analysis.name, exc.message)
        analysis.set_status(Analysis.FAILURE_STATUS, error_msg)
        analysis.galaxy_cleanup()
        return task_list
    # Iterating through files in current galaxy history
    for results in download_list:
        # download file if result state is "ok"
        if results['state'] == 'ok':
            file_type = results["type"]
            curr_file_id = results['name']
            if curr_file_id in dl_dict:
                curr_dl_dict = dl_dict[curr_file_id]
                result_name = curr_dl_dict['filename'] + '.' + file_type
                # size of file defined by galaxy
                file_size = results['file_size']
                # Determine whether Galaxy results should be downloaded over
                # http or copied directly; HTML files are retrieved as zip
                # archives via the dataset URL
                if galaxy_instance.local_download and file_type != 'html':
                    download_url = results['file_name']
                else:
                    download_url = urlparse.urljoin(
                        galaxy_instance.base_url, '/'.join([
                            'datasets',
                            str(results['dataset_id']), 'display?to_ext=txt'
                        ]))
                # workaround to set the correct file type for zip archives of
                # FastQC HTML reports produced by Galaxy dynamically
                if file_type == 'html':
                    file_type = 'zip'
                # TODO: when changing permanent=True, fix update of % download
                # of file
                filestore_uuid = create(source=download_url,
                                        filetype=file_type)
                # adding history files to django model
                temp_file = AnalysisResult(analysis_uuid=analysis.uuid,
                                           file_store_uuid=filestore_uuid,
                                           file_name=result_name,
                                           file_type=file_type)
                temp_file.save()
                analysis.results.add(temp_file)
                analysis.save()
                # downloading analysis results into file_store
                # only download files if size is greater than 0
                if file_size > 0:
                    task_id = import_file.subtask(
                        (filestore_uuid, False, file_size))
                    task_list.append(task_id)

    return task_list
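The returned task_list contains Celery task signatures (import_file.subtask), so a caller would typically dispatch them as a group. A minimal sketch, assuming Celery is the task backend and that an analysis object is already at hand.

from celery import group

task_list = _get_galaxy_download_tasks(analysis)  # analysis obtained elsewhere
if task_list:
    # run all file-store imports in parallel and keep a handle on the result
    group_result = group(task_list).apply_async()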
Example #6
def create_igv_session(genome, uuids, is_file_uuid=False):
    """ Creates session file for selected file uuids, returns newly created
    filestore uuid
    :param is_file_uuid:
    :param genome: Genome to be used in session file i.e. hg18, dm3
    :type genome: string.
    :param uuids: Array of UUIDs to be used
    :type uuids: array.
    """
    # Create IGV Session file and put into Filestore
    """
    http://www.postneo.com/projects/pyxml/

    <?xml version="1.0" encoding="UTF-8"?>
        <Global genome="hg18" locus="EGFR" version="3">
        <Resources>
            <Resource name="RNA Genes"
            path="http://www.broadinstitute.org/igvdata/tcga/gbm/GBM_batch1-8_level3_exp.txt.recentered.080820.gct.tdf"/>
            <Resource name="RNA Genes"
            path="http://www.broadinstitute.org/igvdata/annotations/hg18/rna_genes.bed"/>
            <Resource name="sno/miRNA"
            path="http://www.broadinstitute.org/igvdata/tcga/gbm/Sample_info.txt"/>
        </Resources>
    </Global>
    """
    logger.debug("visualization_manager.create_igv_session called")

    # Create the minidom document
    doc = Document()
    # Create the <wml> base element
    xml = doc.createElement("Global")
    xml.setAttribute("genome", genome)
    xml.setAttribute("locus", "All")
    xml.setAttribute("version", "4")
    doc.appendChild(xml)
    # Add Resources
    xml_resources = doc.createElement("Resources")
    xml.appendChild(xml_resources)
    # get paths to url
    for samp in uuids:
        # gets filestore item
        curr_name, curr_url = get_file_name(samp, is_file_uuid=is_file_uuid)

        logger.debug('New resource: ' + curr_name + ' - ' + curr_url)

        # What to do if fs does not exist?
        if curr_name:
            # creates Resource element
            res = doc.createElement("Resource")
            res.setAttribute("name", curr_name)
            res.setAttribute("path", curr_url)
            xml_resources.appendChild(res)
    # Creating temp file to enter into file_store
    tempfilename = tempfile.NamedTemporaryFile(delete=False)
    tempfilename.write(doc.toprettyxml(indent="  "))
    tempfilename.close()
    # getting file_store_uuid
    filestore_uuid = create(tempfilename.name, filetype="xml")
    filestore_item = import_file(filestore_uuid, refresh=True)
    # file to rename
    temp_name = filestore_item.datafile.name.split('/')
    temp_name = temp_name[len(temp_name) - 1] + '.xml'
    # rename file by way of file_store
    filestore_item = rename(filestore_uuid, temp_name)
    # delete temp file
    os.unlink(tempfilename.name)
    # Url for session file
    fs_url = get_full_url(filestore_item.get_datafile_url())
    # IGV url for automatic launch of Java Webstart
    igv_url = "http://www.broadinstitute.org/igv/projects/current/igv.php" \
              "?sessionURL=" + fs_url
    return igv_url
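The session URL is concatenated onto the IGV launch link without escaping. A hedged variant that percent-encodes the sessionURL query value (Python 2 urllib, matching the examples); whether this is needed depends on the file-store URLs in use.

import urllib  # Python 2, matching the examples above

def build_igv_launch_url(session_file_url):
    """Return an IGV Java Webstart launch link for the given session file."""
    base = "http://www.broadinstitute.org/igv/projects/current/igv.php"
    return base + "?sessionURL=" + urllib.quote(session_file_url, safe=":/")

# e.g. build_igv_launch_url(fs_url) in place of the string concatenation above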
Example #7
def add_igv_samples(fields, results_samp, annot_samples=None):
    """creates phenotype file for IGV
    :param results_samp: Solr results for samples to be included
    :type results_samp: Array.
    :param annot_samples: annotation files included with the Solr results
    :type annot_samples: Array
    """
    # creates human readable indexes of fields to iterate over
    fields_dict = {}
    for i in fields:
        find_index = i.find("_Characteristics_")
        if find_index > -1:
            new_key = i.split("_Characteristics_")[0]
            fields_dict[i] = new_key

    # Creating temp file to enter into file_store
    temp_sample_name = tempfile.NamedTemporaryFile(delete=False)

    # writing header to sample file
    temp_sample_name.write("#sampleTable" + "\n")

    # writing column names to sample file
    col_names = "Linking_id"
    for k, v in fields_dict.iteritems():
        col_names = col_names + '\t' + v
    temp_sample_name.write(col_names + "\n")

    # iterating over sample files
    pheno_results = get_sample_lines(fields_dict, results_samp)
    try:
        temp_sample_name.write(pheno_results)
    except UnicodeEncodeError as e:
        logger.error("Could not write results to file: %s. "
                     "Trying again with the content to write encoded "
                     "properly." % e)
        temp_sample_name.write(pheno_results.encode("utf-8"))

    # if annotations are not null
    if annot_samples:
        pheno_annot = get_sample_lines(fields_dict, annot_samples)
        temp_sample_name.write(pheno_annot)

    # closing temp file
    temp_sample_name.close()

    # getting file_store_uuid
    filestore_uuid = create(temp_sample_name.name, filetype="txt")
    filestore_item = import_file(filestore_uuid, refresh=True)

    # file to rename
    temp_file = filestore_item.datafile.name.split('/')
    temp_file = temp_file[len(temp_file) - 1] + '.txt'

    # rename file by way of file_store
    filestore_item = rename(filestore_uuid, temp_file)

    # getting file information based on file_uuids
    curr_fs = FileStoreItem.objects.get(uuid=filestore_uuid)

    # full path to selected UUID File
    curr_url = get_full_url(curr_fs.get_datafile_url())

    # delete temp file
    os.unlink(temp_sample_name.name)

    return curr_url
Example #8
def create_igv_session_annot(genome, uuids, annot_uuids=None, samp_file=None):
    """Creates session file for selected file uuids, returns newly created
    filestore uuid
    :param genome: Genome to be used in session file i.e. hg18, dm3
    :type genome: string.
    :param uuids: Array of UUIDs to be used
    :type uuids: array.
    """
    # Create IGV Session file and put into Filestore
    """
    http://www.postneo.com/projects/pyxml/

    <?xml version="1.0" encoding="UTF-8"?>
        <Global genome="hg18" locus="EGFR" version="3">
        <Resources>
            <Resource name="RNA Genes"
            path="http://www.broadinstitute.org/igvdata/tcga/gbm/GBM_batch1-8_level3_exp.txt.recentered.080820.gct.tdf"/>
            <Resource name="RNA Genes"
            path="http://www.broadinstitute.org/igvdata/annotations/hg18/rna_genes.bed"/>
            <Resource name="sno/miRNA"
            path="http://www.broadinstitute.org/igvdata/tcga/gbm/Sample_info.txt"/>
        </Resources>
    </Global>
    """
    # Create the minidom document
    doc = Document()
    # Create the <wml> base element
    xml = doc.createElement("Global")
    xml.setAttribute("genome", genome)
    xml.setAttribute("locus", "All")
    xml.setAttribute("version", "4")
    doc.appendChild(xml)
    # Add Resources
    xml_resources = doc.createElement("Resources")
    xml.appendChild(xml_resources)
    # adding selected samples to xml file
    add_igv_resource(uuids["node_uuid"], xml_resources, doc)
    if annot_uuids:
        # adding selected samples to xml file
        add_igv_resource(annot_uuids["node_uuid"], xml_resources, doc)
    # adds sample information file to IGV session file
    if samp_file:
        # <Resource name="Sample Information"
        # path="http://igv.broadinstitute.org/data/hg18/tcga/gbm/gbmsubtypes/sampleTable.txt.gz"/>
        # creates Resource element
        res = doc.createElement("Resource")
        res.setAttribute("name", "Sample Information")
        res.setAttribute("path", samp_file)
        xml_resources.appendChild(res)
    # <HiddenAttributes>
    #    <Attribute name="DATA FILE"/>
    #    <Attribute name="Linking_id"/>
    #    <Attribute name="DATA TYPE"/>
    # </HiddenAttributes>
    # Adding parameters to hide basic unnecessary sample info
    hidden_attr = doc.createElement("HiddenAttributes")
    xml.appendChild(hidden_attr)

    attr = doc.createElement("Attribute")
    attr.setAttribute("name", "DATA FILE")
    hidden_attr.appendChild(attr)

    attr = doc.createElement("Attribute")
    attr.setAttribute("name", "Linking_id")
    hidden_attr.appendChild(attr)

    attr = doc.createElement("Attribute")
    attr.setAttribute("name", "DATA TYPE")
    hidden_attr.appendChild(attr)

    # Creating temp file to enter into file_store
    tempfilename = tempfile.NamedTemporaryFile(delete=False)
    tempfilename.write(doc.toprettyxml(indent="  "))
    tempfilename.close()

    # getting file_store_uuid
    filestore_uuid = create(tempfilename.name, filetype="xml")
    filestore_item = import_file(filestore_uuid, refresh=True)

    # file to rename
    temp_name = filestore_item.datafile.name.split('/')
    temp_name = temp_name[len(temp_name) - 1] + '.xml'

    # rename file by way of file_store
    filestore_item = rename(filestore_uuid, temp_name)

    # delete temp file
    os.unlink(tempfilename.name)

    # Url for session file
    sessionfile_url = get_full_url(filestore_item.get_datafile_url())

    # IGV url for automatic launch of Java Webstart
    igv_url = "http://www.broadinstitute.org/igv/projects/current/igv.php" \
              "?sessionURL=" + sessionfile_url

    return igv_url
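A minimal end-to-end sketch tying the two helpers above together: the phenotype-file URL returned by add_igv_samples becomes the samp_file argument of create_igv_session_annot. Import paths and the shape of the uuids dictionary are assumptions based on the uuids["node_uuid"] access in the code above.

from visualization_manager.views import (  # assumed module path
    add_igv_samples, create_igv_session_annot)

fields = ["organism_Characteristics_generic_s"]
samples = [{"organism_Characteristics_generic_s": "Mouse", "uuid": "1234"}]

samp_file_url = add_igv_samples(fields, samples)
igv_url = create_igv_session_annot(
    "hg18",
    {"node_uuid": ["node-uuid-1", "node-uuid-2"]},  # selected sample nodes
    annot_uuids=None,
    samp_file=samp_file_url,
)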
Example #9
def add_igv_samples(fields, results_samp, annot_samples=None):
    """creates phenotype file for IGV
    :param results_samp: Solr results for samples to be included
    :type results_samp: Array.
    :param annot_samples: annotation files included with the Solr results
    :type annot_samples: Array
    """
    # creates human readable indexes of fields to iterate over
    fields_dict = {}
    for i in fields:
        find_index = i.find("_Characteristics_")
        if find_index > -1:
            new_key = i.split("_Characteristics_")[0]
            fields_dict[i] = new_key

    # Creating temp file to enter into file_store
    temp_sample_name = tempfile.NamedTemporaryFile(delete=False)

    # writing header to sample file
    temp_sample_name.write("#sampleTable" + "\n")

    # writing column names to sample file
    col_names = "Linking_id"
    for k, v in fields_dict.iteritems():
        col_names = col_names + "\t" + v
    temp_sample_name.write(col_names + "\n")

    # iterating over sample files
    pheno_results = get_sample_lines(fields_dict, results_samp)
    try:
        temp_sample_name.write(pheno_results)
    except UnicodeEncodeError as e:
        logger.error(
            "Could not write results to file: %s. "
            "Trying again with the content encoded as UTF-8.", e
        )
        temp_sample_name.write(pheno_results.encode("utf-8"))

    # if annotations are not null
    if annot_samples:
        pheno_annot = get_sample_lines(fields_dict, annot_samples)
        temp_sample_name.write(pheno_annot)

    # closing temp file
    temp_sample_name.close()

    # getting file_store_uuid
    filestore_uuid = create(temp_sample_name.name, filetype="txt")
    filestore_item = import_file(filestore_uuid, refresh=True)

    # file to rename
    temp_file = filestore_item.datafile.name.split("/")
    temp_file = temp_file[len(temp_file) - 1] + ".txt"

    # rename file by way of file_store
    filestore_item = rename(filestore_uuid, temp_file)

    # getting file information based on file_uuids
    curr_fs = FileStoreItem.objects.get(uuid=filestore_uuid)

    # full path to selected UUID File
    curr_url = get_full_url(curr_fs.get_datafile_url())

    # delete temp file
    os.unlink(temp_sample_name.name)

    return curr_url
Example #10
    def _parse_node(self, headers, row):
        '''
        row is a deque, column header is at position len( headers ) - len( row )
        '''
        # TODO: test if this is really a node

        header_components = self._split_header(headers[-len(row)])

        # TODO: for a node the number of header components must be 1
        # assert( len( header_components ) ) == 1

        # try to retrieve this node from the database (unless it is a normalization or data transformation)
        is_new = True

        # name of the node
        node_name = row[0].strip()

        # TODO: remove this once it has been implemented in the preprocessing
        if header_components[
                0] == Node.RAW_DATA_FILE and self.additional_raw_data_file_extension is not None and len(
                    node_name) > 0:
            if not re.search(r'%s$' % self.additional_raw_data_file_extension,
                             node_name):
                node_name += self.additional_raw_data_file_extension

        if (header_components[0] in Node.ASSAYS | {
                Node.SAMPLE, Node.SOURCE, Node.EXTRACT, Node.LABELED_EXTRACT,
                Node.DATA_TRANSFORMATION, Node.NORMALIZATION
        } and len(node_name) > 0) or (header_components[0] in Node.FILES
                                      and len(node_name) > 0):
            if header_components[0] in {Node.SAMPLE, Node.SOURCE}:
                #print "1  --looking up type " + header_components[0] + " =  " +  row[0].strip() + " in study only (" + str( self._current_study ) + ")"
                node, is_new = Node.objects.get_or_create(
                    study=self._current_study,
                    type=header_components[0],
                    name=node_name)
            else:
                #print "2    -- looking up type " + header_components[0] + " =  " +  row[0].strip() + "in study AND assay (" + str( self._current_study ) + ", " +  str( self._current_assay ) + ")"
                node, is_new = Node.objects.get_or_create(
                    study=self._current_study,
                    assay=self._current_assay,
                    type=header_components[0],
                    name=node_name)

            # this node represents a file - add the file to the file store and store the file UUID in the node
            if is_new and header_components[
                    0] in Node.FILES and node_name != "":

                # create the nodes for the data file in this row
                if self.file_base_path is None:
                    file_path = node_name
                else:
                    # test if this node is referring to a remote url
                    components = urlparse(node_name)
                    if components.scheme == "" or components.netloc == "":
                        # not a remote url
                        file_path = os.path.join(self.file_base_path,
                                                 node_name)
                    else:
                        file_path = node_name

                uuid = create(source=file_path)

                if uuid is not None:
                    node.file_uuid = uuid
                    node.save()
                else:
                    logger.exception("Unable to add " + file_path +
                                     " to file store as a temporary file.")

            if is_new:
                logger.info("New node " + str(node) + " created.")
            else:
                logger.info("Node " + str(node) + " retrieved.")

        else:
            if len(node_name) > 0:
                #print "3      -- looking up type " + header_components[0] + " =  " +  row[0].strip() + "in study AND assay (" + str( self._current_study ) + ", " +  str( self._current_assay ) + ")"
                node = Node.objects.create(study=self._current_study,
                                           assay=self._current_assay,
                                           type=header_components[0],
                                           name=node_name)
            else:
                # do not create empty nodes!
                node = None

        self._current_node = node

        if self._previous_node is not None and self._current_node is not None:
            try:
                # test if the node has already been created (??? why not use an if statement ???)
                node.parents.get(to_node_id=self._previous_node.id)
            except:
                self._previous_node.children.add(node)
                node.parents.add(self._previous_node)
                node.save()
                self._previous_node.save()
        else:
            # TODO: look up parent nodes in DB
            pass

        # remove the node from the row
        row.popleft()

        # read until we hit the next node
        while not self.is_node(headers[-len(row)]):
            if self._current_node is not None:
                if self.is_attribute(headers[-len(row)]):
                    self._parse_attribute(headers, row)
                elif self.is_protocol_reference(headers[-len(row)]):
                    self._parse_protocol_reference(headers, row)
                else:
                    logger.error("Unexpected element " + headers[-len(row)] +
                                 " when parsing node in line " +
                                 str(self._current_reader.line_num) +
                                 ", column " + str(len(headers) - len(row)) +
                                 ".")
                    row.popleft()
            else:  # node is none, pop until the next node because attributes can't be attached to anything
                row.popleft()

        if self._current_node is not None:
            node.save()
            self._previous_node = node
            self._current_node = None

        return node
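A stdlib-only sketch of the remote-versus-local check used above when resolving a node's data file against file_base_path; the helper name is hypothetical.

import os
from urlparse import urlparse  # Python 2, matching the examples

def resolve_file_path(node_name, file_base_path=None):
    """Join node_name onto file_base_path unless it is already a remote URL."""
    if file_base_path is None:
        return node_name
    components = urlparse(node_name)
    if components.scheme == "" or components.netloc == "":
        # not a remote url, so treat it as relative to the base path
        return os.path.join(file_base_path, node_name)
    return node_name

# resolve_file_path("sample1.fastq", "/data/isa")  -> "/data/isa/sample1.fastq"
# resolve_file_path("http://example.org/f.bam", "/data/isa")  -> unchanged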
Example #11
def createIGVsession(genome, uuids, is_file_uuid=False):
    """ Creates session file for selected file uuids, returns newly created filestore uuid 
    
    :param genome: Genome to be used in session file i.e. hg18, dm3
    :type genome: string.
    :param uuids: Array of UUIDs to be used
    :type uuids: array.
    """
    
    # Create IGV Session file and put into Filestore
    """
    http://www.postneo.com/projects/pyxml/
    
    <?xml version="1.0" encoding="UTF-8"?>
        <Global genome="hg18" locus="EGFR" version="3">
        <Resources>
            <Resource name="RNA Genes" path="http://www.broadinstitute.org/igvdata/tcga/gbm/GBM_batch1-8_level3_exp.txt.recentered.080820.gct.tdf"/>
            <Resource name="RNA Genes" path="http://www.broadinstitute.org/igvdata/annotations/hg18/rna_genes.bed"/>
            <Resource name="sno/miRNA" path="http://www.broadinstitute.org/igvdata/tcga/gbm/Sample_info.txt"/>
        </Resources>
    </Global>
    """
    logger.debug("visualization_manager.createIGVsession called")
    
    # Create the minidom document
    doc = Document()
    
    # Create the <wml> base element
    xml = doc.createElement("Global")
    xml.setAttribute("genome", genome)
    xml.setAttribute("locus", "All")
    xml.setAttribute("version", "4")
    doc.appendChild(xml)
    
    # Add Resources
    xml_resources = doc.createElement("Resources")
    xml.appendChild(xml_resources)
    
    # get paths to url 
    for samp in uuids:
        # gets filestore item 
        curr_name, curr_url = get_file_name(samp, is_file_uuid=is_file_uuid)

        logger.debug( 'New resource: ' + curr_name + ' - ' +  curr_url )
        
        # What to do if fs does not exist? 
        if (curr_name):
            
            # creates Resource element 
            res = doc.createElement("Resource")
            res.setAttribute("name", curr_name)
            res.setAttribute("path", curr_url)
            xml_resources.appendChild(res)
            
    
    # Creating temp file to enter into file_store
    tempfilename = tempfile.NamedTemporaryFile(delete=False)
    tempfilename.write(doc.toprettyxml(indent="  "))
    tempfilename.close()
    
    # getting file_store_uuid
    filestore_uuid = create(tempfilename.name, permanent=True, filetype="xml")
    filestore_item = import_file(filestore_uuid, permanent=True, refresh=True)
    
    # file to rename
    temp_name = filestore_item.datafile.name.split('/')
    temp_name = temp_name[len(temp_name)-1] + '.xml'
    
    # rename file by way of file_store
    filestore_item = rename(filestore_uuid, temp_name)
    
    # delete temp file
    os.unlink(tempfilename.name)
    
    # Print our newly created XML
    #print doc.toprettyxml(indent="  ")
    #print filestore_item.datafile.url
    
    # Url for session file 
    fs_url = filestore_item.get_full_url()
    
    # IGV url for automatic launch of Java Webstart
    igv_url = "http://www.broadinstitute.org/igv/projects/current/igv.php?sessionURL=" + fs_url
    
    return igv_url
Example #12
def addIGVSamples(fields, results_samp, annot_samples=None):
    """ creates phenotype file for IGV 
    
    :param results_samp: Solr results for samples to be included
    :type results_samp: Array.
    :param annot_samples: annotation files included with the Solr results
    :type annot_samples: Array
    """
    
    #logger.debug("visualization_manager.views addIGVSamples called, fields=%s" % fields)
    
    # creates human readable indexes of fields to iterate over
    fields_dict = {}
    for i in fields:
        find_index = i.find("_Characteristics_")
        if find_index > -1:
            new_key = i.split("_Characteristics_")[0]
            fields_dict[i] = new_key
    
    # Creating temp file to enter into file_store
    tempsampname = tempfile.NamedTemporaryFile(delete=False)
    
    # writing header to sample file 
    tempsampname.write("#sampleTable" + "\n")
    
    # writing column names to sample file 
    col_names = "Linking_id"
    for k,v in fields_dict.iteritems():
        col_names = col_names + '\t' + v
    tempsampname.write(col_names + "\n")
    
    # iterating over sample files 
    pheno_results = get_sample_lines(fields_dict, results_samp)
    tempsampname.write(pheno_results)
    
    # if annotations are not null
    if annot_samples:
        #results_annot = annot_samples["response"]["docs"]
        pheno_annot = get_sample_lines(fields_dict, annot_samples)
        tempsampname.write(pheno_annot)
        
    # closing temp file 
    tempsampname.close()

    # getting file_store_uuid
    filestore_uuid = create(tempsampname.name, permanent=True, filetype="txt")
    filestore_item = import_file(filestore_uuid, permanent=True, refresh=True)
    
    # file to rename
    temp_file = filestore_item.datafile.name.split('/')
    temp_file = temp_file[len(temp_file)-1] + '.txt'
    
    # rename file by way of file_store
    filestore_item = rename(filestore_uuid, temp_file)
    
    # getting file information based on file_uuids
    curr_fs = FileStoreItem.objects.get(uuid=filestore_uuid)
    curr_name = curr_fs.datafile.name
    
    # full path to selected UUID File
    curr_url = curr_fs.get_full_url()
    
    # delete temp file
    os.unlink(tempsampname.name)
    
    return curr_url
Example #13
def createIGVsessionAnnot(genome, uuids, annot_uuids=None, samp_file=None):
    """ Creates session file for selected file uuids, returns newly created filestore uuid 
    
    :param genome: Genome to be used in session file i.e. hg18, dm3
    :type genome: string.
    :param uuids: Array of UUIDs to be used
    :type uuids: array.
    """
    
    # Create IGV Session file and put into Filestore
    """
    http://www.postneo.com/projects/pyxml/
    
    <?xml version="1.0" encoding="UTF-8"?>
        <Global genome="hg18" locus="EGFR" version="3">
        <Resources>
            <Resource name="RNA Genes" path="http://www.broadinstitute.org/igvdata/tcga/gbm/GBM_batch1-8_level3_exp.txt.recentered.080820.gct.tdf"/>
            <Resource name="RNA Genes" path="http://www.broadinstitute.org/igvdata/annotations/hg18/rna_genes.bed"/>
            <Resource name="sno/miRNA" path="http://www.broadinstitute.org/igvdata/tcga/gbm/Sample_info.txt"/>
        </Resources>
    </Global>
    """
    #logger.debug("visualization_manager.views createIGVsessionAnnot called")
    
    # Create the minidom document
    doc = Document()
    
    # Create the <wml> base element
    xml = doc.createElement("Global")
    xml.setAttribute("genome", genome)
    xml.setAttribute("locus", "All")
    xml.setAttribute("version", "4")
    doc.appendChild(xml)
    
    # Add Resources
    xml_resources = doc.createElement("Resources")
    xml.appendChild(xml_resources)        
    
    # adding selected samples to xml file
    addIGVResource(uuids["node_uuid"], xml_resources, doc)
    
    if annot_uuids:
        # adding selected samples to xml file
        addIGVResource(annot_uuids["node_uuid"], xml_resources, doc)
        
        
    # adds sample information file to IGV session file 
    if samp_file:
        #<Resource name="Sample Information" path="http://igv.broadinstitute.org/data/hg18/tcga/gbm/gbmsubtypes/sampleTable.txt.gz"/>
        # creates Resource element 
        res = doc.createElement("Resource")
        res.setAttribute("name", "Sample Information")
        res.setAttribute("path", samp_file)
        xml_resources.appendChild(res)    
    
    #<HiddenAttributes>
    #    <Attribute name="DATA FILE"/>
    #    <Attribute name="Linking_id"/>
    #    <Attribute name="DATA TYPE"/>
    #</HiddenAttributes>
    # Adding parameters to hide basic unnecessary sample info
    hidden_attr = doc.createElement("HiddenAttributes")
    xml.appendChild(hidden_attr) 
    
    attr = doc.createElement("Attribute")
    attr.setAttribute("name", "DATA FILE")
    hidden_attr.appendChild(attr) 
    
    attr = doc.createElement("Attribute")
    attr.setAttribute("name", "Linking_id")
    hidden_attr.appendChild(attr) 
    
    attr = doc.createElement("Attribute")
    attr.setAttribute("name", "DATA TYPE")
    hidden_attr.appendChild(attr) 
    
    
    # Creating temp file to enter into file_store
    tempfilename = tempfile.NamedTemporaryFile(delete=False)
    tempfilename.write(doc.toprettyxml(indent="  "))
    tempfilename.close()
    
    # getting file_store_uuid
    filestore_uuid = create(tempfilename.name, permanent=True, filetype="xml")
    filestore_item = import_file(filestore_uuid, permanent=True, refresh=True)
    
    # file to rename
    temp_name = filestore_item.datafile.name.split('/')
    temp_name = temp_name[len(temp_name)-1] + '.xml'
    
    # rename file by way of file_store
    filestore_item = rename(filestore_uuid, temp_name)
    
    # delete temp file
    os.unlink(tempfilename.name)
    
    # Print our newly created XML
    #logger.info( doc.toprettyxml(indent="  "))
    #print filestore_item.datafile.url
    
    # Url for session file 
    fs_url = filestore_item.get_full_url()
    
    # IGV url for automatic launch of Java Webstart
    igv_url = "http://www.broadinstitute.org/igv/projects/current/igv.php?sessionURL=" + fs_url
    
    return igv_url
Example #14
def download_history_files(analysis):
    """Download entire histories from galaxy.
    Getting files out of history to file store.

    """
    logger.debug("analysis_manger.download_history_files called")

    # retrieving list of files to download for workflow
    #TODO: handle Django exceptions
    analysis = Analysis.objects.get(uuid=analysis.uuid)
    dl_files = analysis.workflow_dl_files

    ### creating dictionary based on files to download predetermined by workflow w/ keep operators
    dl_dict = {}
    for dl in dl_files.all():
        temp_dict = {}
        temp_dict['filename'] = dl.filename
        temp_dict['pair_id'] = dl.pair_id
        dl_dict[str(dl.step_id)] = temp_dict

    task_list = []
    # gets current galaxy connection
    connection = analysis.get_galaxy_connection()
    try:
        download_list = connection.get_history_file_list(analysis.history_id)
    except RuntimeError as exc:
        error_msg = "Post-processing failed: " + \
            "error downloading Galaxy history files for analysis '{}': {}" \
            .format(analysis.name, exc.message)
        logger.error(error_msg)
        if not isinstance(exc, (ConnectionError, TimeoutError, AuthError)):
            analysis.set_status(Analysis.FAILURE_STATUS, error_msg)
            try:
                analysis.delete_galaxy_library()
                analysis.delete_galaxy_workflow()
                analysis.delete_galaxy_history()
            except RuntimeError:
                logger.error("Cleanup failed for analysis '{}'".format(
                    analysis.name))
        return task_list

    # Iterating through files in current galaxy history
    for results in download_list:
        # download file if result state is "ok"
        if results['state'] == 'ok':
            file_type = results["type"]
            curr_file_id = results['name']

            if curr_file_id in dl_dict:
                curr_dl_dict = dl_dict[curr_file_id]
                result_name = curr_dl_dict['filename'] + '.' + file_type
                # size of file defined by galaxy
                file_size = results['file_size']

                # Determine whether galaxy results should be downloaded through http or by copying files directly
                local_download = analysis.workflow.workflow_engine.instance.local_download

                # to retrieve HTML files as zip archives via dataset URL
                if local_download and file_type != 'html':
                    download_url = results['file_name']
                else:
                    download_url = connection.make_url(str(
                        results['dataset_id']),
                                                       is_data=True,
                                                       key=False)

                # workaround to set the correct file type for zip archives of
                # reports produced by FASTQC
                if file_type == 'html':
                    file_type = 'zip'

                # getting file_store_uuid,
                # TODO: when changing permanent=True, fix update of % download of file
                filestore_uuid = create(source=download_url,
                                        filetype=file_type,
                                        permanent=False)

                # adding history files to django model
                temp_file = AnalysisResult(analysis_uuid=analysis.uuid,
                                           file_store_uuid=filestore_uuid,
                                           file_name=result_name,
                                           file_type=file_type)
                temp_file.save()
                analysis.results.add(temp_file)
                analysis.save()

                # downloading analysis results into file_store
                # only download files if size is greater than 0
                if file_size > 0:
                    #task_id = import_file.subtask((filestore_uuid, True, False, file_size,))
                    # local download, force copying into the file_store instead of symlinking
                    if local_download:
                        task_id = import_file.subtask((
                            filestore_uuid,
                            False,
                            True,
                            file_size,
                        ))
                    else:
                        task_id = import_file.subtask((
                            filestore_uuid,
                            False,
                            False,
                            file_size,
                        ))
                    task_list.append(task_id)

    return task_list
Example #15
    def _parse_node(self, headers, row):
        """row is a deque, column header is at position len(headers) - len(row)
        """
        # TODO: test if this is really a node
        header_components = self._split_header(headers[-len(row)])
        # TODO: for a node the number of header components must be 1
        # assert(len(header_components)) == 1

        # try to retrieve this node from the database (unless it is a
        # normalization or data transformation)
        is_new = True

        # name of the node
        node_name = row[0].strip()

        # TODO: remove this once it has been implemented in the preprocessing
        if (header_components[0] == Node.RAW_DATA_FILE and
                self.additional_raw_data_file_extension is not None and
                len(node_name) > 0):
            if not re.search(
                        r'%s$' % self.additional_raw_data_file_extension,
                        node_name
                    ):
                node_name += self.additional_raw_data_file_extension

        if (header_components[0] in Node.ASSAYS |
            {Node.SAMPLE, Node.SOURCE, Node.EXTRACT, Node.LABELED_EXTRACT,
             Node.DATA_TRANSFORMATION, Node.NORMALIZATION} and
                len(node_name) > 0) or \
                (header_components[0] in Node.FILES and len(node_name) > 0):
            if header_components[0] in {Node.SAMPLE, Node.SOURCE}:
                node, is_new = Node.objects.get_or_create(
                    study=self._current_study,
                    type=header_components[0],
                    name=node_name)
            else:
                node, is_new = Node.objects.get_or_create(
                    study=self._current_study,
                    assay=self._current_assay,
                    type=header_components[0],
                    name=node_name)
            # this node represents a file - add the file to the file store and
            # store the file UUID in the node
            if (is_new and
                    header_components[0] in Node.FILES and
                    node_name != ""):
                # create the nodes for the data file in this row
                if self.file_base_path is None:
                    file_path = node_name
                else:
                    # test if this node is referring to a remote url
                    components = urlparse(node_name)
                    if components.scheme == "" or components.netloc == "":
                        # not a remote url
                        file_path = os.path.join(
                            self.file_base_path, node_name)
                    else:
                        file_path = node_name

                uuid = create(source=file_path)

                if uuid is not None:
                    node.file_uuid = uuid
                    node.save()
                else:
                    logger.exception(
                        "Unable to add " + file_path + " to file store as a "
                        "temporary file.")
            if is_new:
                logger.info("New node " + str(node) + " created.")
            else:
                logger.info("Node " + str(node) + " retrieved.")
        else:
            if len(node_name) > 0:
                node = Node.objects.create(
                    study=self._current_study,
                    assay=self._current_assay,
                    type=header_components[0],
                    name=node_name)
            else:
                # do not create empty nodes!
                node = None

        self._current_node = node

        if self._previous_node is not None and self._current_node is not None:
            try:
                # test if the node has already been created (??? why not use an
                # if statement ???)
                node.parents.get(to_node_id=self._previous_node.id)
            except:
                self._previous_node.children.add(node)
                node.parents.add(self._previous_node)
                node.save()
                self._previous_node.save()
        else:
            # TODO: look up parent nodes in DB
            pass

        # remove the node from the row
        row.popleft()
        # read until we hit the next node
        while not self.is_node(headers[-len(row)]):
            if self._current_node is not None:
                if self.is_attribute(headers[-len(row)]):
                    self._parse_attribute(headers, row)
                elif self.is_protocol_reference(headers[-len(row)]):
                    self._parse_protocol_reference(headers, row)
                else:
                    logger.error(
                        "Unexpected element " + headers[-len(row)] + " when "
                        "parsing node in line " +
                        str(self._current_reader.line_num) + ", column " +
                        str(len(headers) - len(row)) + ".")
                    row.popleft()
            else:
                # node is none, pop until the next node because attributes
                # can't be attached to anything
                row.popleft()
        if self._current_node is not None:
            node.save()
            self._previous_node = node
            self._current_node = None

        return node
Example #16
def get_galaxy_download_tasks(analysis):
    """Get file import tasks for Galaxy analysis results"""
    logger.debug("Preparing to download analysis results from Galaxy")

    # retrieving list of files to download for workflow
    dl_files = analysis.workflow_dl_files
    # create a dictionary of files to download, as predetermined by the
    # workflow's keep operators
    dl_dict = {}
    for dl in dl_files.all():
        temp_dict = {}
        temp_dict['filename'] = dl.filename
        temp_dict['pair_id'] = dl.pair_id
        dl_dict[str(dl.step_id)] = temp_dict
    task_list = []
    galaxy_instance = analysis.workflow.workflow_engine.instance
    try:
        download_list = galaxy_instance.get_history_file_list(
            analysis.history_id)
    except galaxy.client.ConnectionError as exc:
        error_msg = "Error downloading Galaxy history files for analysis " \
                    "'%s': %s"
        logger.error(error_msg, analysis.name, exc.message)
        analysis.set_status(Analysis.FAILURE_STATUS, error_msg)
        analysis.galaxy_cleanup()
        return task_list
    # Iterating through files in current galaxy history
    for results in download_list:
        # download file if result state is "ok"
        if results['state'] == 'ok':
            file_type = results["type"]
            curr_file_id = results['name']
            if curr_file_id in dl_dict:
                curr_dl_dict = dl_dict[curr_file_id]
                result_name = curr_dl_dict['filename'] + '.' + file_type
                # size of file defined by galaxy
                file_size = results['file_size']
                # Determine whether Galaxy results should be downloaded over
                # http or copied directly; HTML files are retrieved as zip
                # archives via the dataset URL
                if galaxy_instance.local_download and file_type != 'html':
                    download_url = results['file_name']
                else:
                    download_url = urlparse.urljoin(
                            galaxy_instance.base_url, '/'.join(
                                    ['datasets', str(results['dataset_id']),
                                     'display?to_ext=txt']))
                # workaround to set the correct file type for zip archives of
                # FastQC HTML reports produced by Galaxy dynamically
                if file_type == 'html':
                    file_type = 'zip'
                # TODO: when changing permanent=True, fix update of % download
                # of file
                filestore_uuid = create(
                    source=download_url, filetype=file_type, permanent=False)
                # adding history files to django model
                temp_file = AnalysisResult(
                    analysis_uuid=analysis.uuid,
                    file_store_uuid=filestore_uuid,
                    file_name=result_name, file_type=file_type)
                temp_file.save()
                analysis.results.add(temp_file)
                analysis.save()
                # downloading analysis results into file_store
                # only download files if size is greater than 0
                if file_size > 0:
                    # local download, force copying into the file_store instead
                    # of symlinking
                    if galaxy_instance.local_download:
                        task_id = import_file.subtask(
                            (filestore_uuid, False, True, file_size,))
                    else:
                        task_id = import_file.subtask(
                            (filestore_uuid, False, False, file_size,))
                    task_list.append(task_id)

    return task_list
Example #17
    def run(self, path, isa_archive=None, preisa_archive=None):
        """If path is a file it will be treated as an ISArchive, if it is a
        directory it will be treated as an extracted ISArchive. Assumes that
        the archive extracts into a subdirectory named <archive> if the
        ISArchive is called <archive>.zip.
        """
        # reset all variables
        self._current_investigation = None
        self._current_study = None
        self._current_assay = None
        self._current_node = None
        self._previous_node = None
        self._current_attribute = None
        self._current_protocol_reference = None
        self._current_reader = None
        self._current_file = None
        self._current_file_name = None
        # 1. test if archive needs to be extracted and extract if necessary
        if not os.path.isdir(path):
            # assign to isa_archive if it's an archive anyway
            isa_archive = path
            logger.info(
                "Supplied path \"" + path + "\" is not a directory. Assuming "
                "ISArchive file.")
            try:
                # TODO: do we need a random subdirectory here?
                extract_path = tempfile.mkdtemp()
                with ZipFile(path, 'r') as zip:
                    # test if any paths are relative or absolute and outside
                    # the extract path
                    for name in zip.namelist():
                        if name.startswith("..") or name.startswith("/"):
                            logger.exception(
                                "Unable to extract assumed ISArchive file \"" +
                                path + "\" due to illegal file path: " + name
                            )
                    # extract archive
                    zip.extractall(extract_path)
                    first_file = zip.namelist()[0]
                    # test if first entry in zip file is a path
                    if first_file.endswith("/"):
                        # add archive subdirectory to path
                        extract_path = os.path.join(extract_path, first_file)
                    elif re.search(r'/', first_file):
                        ind = string.find(first_file, '/')
                        extract_path = os.path.join(
                            extract_path,
                            first_file[:ind]
                        )

                    logger.info(
                        "ISArchive extracted to \"" + extract_path + "\"."
                    )
                    path = extract_path
            except:
                logger.exception(
                    "Unable to extract assumed ISArchive file \"" + path +
                    "\".")
        # 2. identify investigation file
        try:
            investigation_file_name = glob.glob("%s/i*.txt" % path).pop()
        except IndexError as exception:
            logger.exception(
                "Unable to identify ISArchive file in \"" + path + "\".")
            raise exception
        # 3. parse investigation file and identify study files and
        # corresponding assay files
        self._parse_investigation_file(investigation_file_name)
        # 4. parse all study files and corresponding assay files
        if self._current_investigation is not None:
            # identify studies associated with this investigation
            for study in self._current_investigation.study_set.all():
                # parse study file
                self._current_assay = None
                study_file_name = os.path.join(path, study.file_name)
                if data_set_manager.tasks.fix_last_col(study_file_name):
                    self._parse_study_file(study, study_file_name)
                    for assay in study.assay_set.all():
                        # parse assay file
                        self._previous_node = None
                        assay_file_name = os.path.join(path, assay.file_name)
                        if data_set_manager.tasks.fix_last_col(
                                assay_file_name):
                            self._parse_assay_file(
                                study,
                                assay,
                                assay_file_name)
        else:
            logger.exception(
                "No investigation was identified when parsing investigation "
                "file \"" + investigation_file_name + "\"")
            raise Exception()
        # 5. assign ISA-Tab archive and pre-ISA-Tab archive if present
        try:
            self._current_investigation.isarchive_file = create(isa_archive)
            import_file(self._current_investigation.isarchive_file,
                        refresh=True)
        except:
            pass

        if preisa_archive:
            self._current_investigation.pre_isarchive_file = \
                create(preisa_archive)
            import_file(self._current_investigation.pre_isarchive_file,
                        refresh=True)

        self._current_investigation.save()
        return self._current_investigation
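A minimal usage sketch for run(); the parser class name and module are assumptions (the method clearly belongs to a parser object with _parse_* helpers), and the archive path is a placeholder.

from data_set_manager.isa_tab_parser import IsaTabParser  # assumed location

parser = IsaTabParser()
# path may be a zipped ISArchive or an already-extracted directory
investigation = parser.run("/tmp/example-isatab.zip")  # placeholder path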
Example #18
    def _parse_file(self, file_name ):
        try:
            self._current_file =  open( file_name, "rU" )
            self._current_reader = csv.reader( self._current_file, dialect="excel-tab", delimiter=self.delimiter )
        except:
            logger.exception( "Unable to read file " + str( self._current_file ) + "." )
        
        # create investigation, study and assay objects
        investigation = self._create_investigation()
        study = self._create_study( investigation=investigation, file_name=file_name )                
        assay = self._create_assay( study=study, file_name=file_name )
        
        #import in file as "pre-isa" file
        logger.info('trying to add pre-isa archive file %s' % file_name)
        investigation.pre_isarchive_file = create(file_name, permanent=True)
        import_file(investigation.pre_isarchive_file, refresh=True, permanent=True)
        investigation.save()
            
        # read column headers
        headers = []
        headers = self._current_reader.next()
        
        # compute absolute file_column_index (in case a negative value was provided)
        if self.file_column_index >= 0:
            internal_file_column_index = self.file_column_index
        else:                
            internal_file_column_index = len( headers ) + self.file_column_index

        # compute absolute auxiliary_file_column_index (in case a negative value was provided)
        if self.auxiliary_file_column_index is not None:
            if self.auxiliary_file_column_index >= 0:
                internal_auxiliary_file_column_index = self.auxiliary_file_column_index
            else:                
                internal_auxiliary_file_column_index = len( headers ) + self.auxiliary_file_column_index
        else:
            internal_auxiliary_file_column_index = None
        
        # TODO: test if there are fewer columns than required        
        logger.debug( "Parsing with file column %s and auxiliary file column %s." % ( internal_file_column_index, internal_auxiliary_file_column_index ) )
        
        # iterate over non-header rows in file
        for row in self._current_reader:
            
                
            # TODO: resolve relative indices
            internal_source_column_index = self.source_column_index            
            internal_sample_column_index = self.sample_column_index            
            internal_assay_column_index = self.assay_column_index            
                

            # add data file to file store
            file_uuid = None            

            if self.file_base_path is None:
                file_path = row[internal_file_column_index].strip()
            else:
                file_path = os.path.join( self.file_base_path, row[internal_file_column_index].strip() )

            file_uuid = create( source=file_path, permanent=self.file_permanent )
                                    
            if file_uuid is not None:
                logger.debug( "Added data file " + file_path + " to file store." )
            else:
                logger.exception( "Unable to add data file " + file_path + " to file store." )            


            # add auxiliary file to file store
            auxiliary_file_uuid = None

            if internal_auxiliary_file_column_index is not None:
                if self.file_base_path is None:
                    auxiliary_file_path = \
                        row[internal_auxiliary_file_column_index].strip()
                else:
                    auxiliary_file_path = os.path.join(
                        self.file_base_path,
                        row[internal_auxiliary_file_column_index].strip())

                auxiliary_file_uuid = create(source=auxiliary_file_path,
                                             permanent=self.file_permanent)

                if auxiliary_file_uuid is not None:
                    logger.debug("Added auxiliary file %s to file store.",
                                 auxiliary_file_path)
                else:
                    logger.error("Unable to add auxiliary file %s to file store.",
                                 auxiliary_file_path)

            # add files to file server
            file_server.models.add(file_uuid, auxiliary_file_uuid)

            # create nodes if file was successfully created

            # source node
            source_name = self._create_name(row, internal_source_column_index,
                                            internal_file_column_index)
            source_node, is_source_new = Node.objects.get_or_create(
                study=study,
                name=source_name,
                type=Node.SOURCE)

            # sample node
            sample_name = self._create_name(row, internal_sample_column_index,
                                            internal_file_column_index)
            sample_node, is_sample_new = Node.objects.get_or_create(
                study=study,
                name=sample_name,
                type=Node.SAMPLE)
            source_node.add_child(sample_node)

            # assay node
            assay_name = self._create_name(row, internal_assay_column_index,
                                           internal_file_column_index)
            assay_node, is_assay_new = Node.objects.get_or_create(
                study=study,
                assay=assay,
                name=assay_name,
                type=Node.ASSAY)
            sample_node.add_child(assay_node)
            
            file_node = Node.objects.create(
                study=study,
                assay=assay,
                name=row[internal_file_column_index].strip(),
                file_uuid=file_uuid,
                type=Node.RAW_DATA_FILE,
                species=self._get_species(row),
                genome_build=self._get_genome_build(row),
                is_annotation=self._is_annotation(row))
            assay_node.add_child(file_node)
            
            # iterate over columns to create attributes to attach to the sample node
            for column_index in range(0, len(row)):
                # skip data file column
                if (internal_file_column_index == column_index or
                        internal_auxiliary_file_column_index == column_index or
                        self.annotation_column_index == column_index):
                    continue

                # create attribute as characteristic and attach to sample node
                # if the sample node was newly created
                if is_sample_new:
                    Attribute.objects.create(
                        node=sample_node,
                        type=Attribute.CHARACTERISTICS,
                        subtype=headers[column_index].strip().lower(),
                        value=row[column_index].strip())

        return investigation

    def run(self, path, isa_archive=None, preisa_archive=None):
        '''
        If path is a file, it is treated as an ISArchive; if it is a directory,
        it is treated as an extracted ISArchive. Assumes that an ISArchive named
        <archive>.zip extracts into a subdirectory named <archive>.
        '''

        # reset all variables
        self._current_investigation = None
        self._current_study = None
        self._current_assay = None
        self._current_node = None
        self._previous_node = None
        self._current_attribute = None
        self._current_protocol_reference = None
        self._current_reader = None
        self._current_file = None
        self._current_file_name = None

        # 1. test if archive needs to be extracted and extract if necessary
        if not os.path.isdir(path):
            #assign to isa_archive if it's an archive anyway
            isa_archive = path
            logger.info("Supplied path \"" + path +
                        "\" is not a directory. Assuming ISArchive file.")
            try:
                # TODO: do we need a random subdirectory here?
                extract_path = tempfile.mkdtemp()
                with ZipFile(path, 'r') as zip:
                    # test if any paths are relative or absolute and outside the extract path
                    for name in zip.namelist():
                        if name.startswith("..") or name.startswith("/"):
                            logger.error(
                                "Unable to extract assumed ISArchive file \"" +
                                path + "\" due to illegal file path: " + name)

                    # extract archive
                    zip.extractall(extract_path)

                    first_file = zip.namelist()[0]
                    # test if first entry in zip file is a path
                    if first_file.endswith("/"):
                        # add archive subdirectory to path
                        extract_path = os.path.join(extract_path, first_file)
                    elif re.search(r'/', first_file):
                        ind = string.find(first_file, '/')
                        extract_path = os.path.join(extract_path,
                                                    first_file[:ind])
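                    # e.g. if the first zip entry is "mystudy/i_investigation.txt",
                    # extract_path becomes "<tempdir>/mystudy", the archive's
                    # top-level folder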

                    logger.info("ISArchive extracted to \"" + extract_path +
                                "\".")
                    path = extract_path
            except Exception:
                logger.exception(
                    "Unable to extract assumed ISArchive file \"" + path +
                    "\".")

        # 2. identify investigation file
        try:
            investigation_file_name = glob.glob("%s/i*.txt" % path).pop()
        except IndexError as exception:
            logger.exception("Unable to identify ISArchive file in \"" + path +
                             "\".")
            raise exception

        # 3. parse investigation file and identify study files and corresponding assay files
        self._parse_investigation_file(investigation_file_name)

        # 4. parse all study files and corresponding assay files
        if self._current_investigation is not None:
            # identify studies associated with this investigation
            for study in self._current_investigation.study_set.all():
                # parse study file
                self._current_assay = None
                study_file_name = os.path.join(path, study.file_name)
                if data_set_manager.tasks.fix_last_col(study_file_name):
                    self._parse_study_file(study, study_file_name)
                    for assay in study.assay_set.all():
                        # parse assay file
                        self._previous_node = None
                        assay_file_name = os.path.join(path, assay.file_name)
                        if data_set_manager.tasks.fix_last_col(
                                assay_file_name):
                            self._parse_assay_file(study, assay,
                                                   assay_file_name)
        else:
            logger.error(
                "No investigation was identified when parsing investigation "
                "file \"" + investigation_file_name + "\"")
            raise Exception(
                "No investigation was identified when parsing investigation "
                "file " + investigation_file_name)

        # assign ISA-Tab archive and pre-ISA-Tab archive if present
        try:
            self._current_investigation.isarchive_file = create(isa_archive,
                                                                permanent=True)
        except Exception:
            pass

        if preisa_archive:
            self._current_investigation.pre_isarchive_file = create(
                preisa_archive, permanent=True)

        self._current_investigation.save()

        return self._current_investigation

    def run(self):
        # create investigation, study and assay objects
        investigation = self._create_investigation()
        # FIXME: self.metadata_file.name may not be informative, especially in
        # case of temp files that don't exist on disk
        study = self._create_study(investigation=investigation,
                                   file_name=self.metadata_file.name)
        assay = self._create_assay(study=study,
                                   file_name=self.metadata_file.name)

        # import in file as "pre-isa" file
        logger.info("trying to add pre-isa archive file %s",
                    self.metadata_file.name)
        # FIXME: this will not create a FileStoreItem if self.metadata_file
        # does not exist on disk (e.g., a file object like TemporaryFile)
        investigation.pre_isarchive_file = create(
            self.metadata_file.name, permanent=True)
        import_file(investigation.pre_isarchive_file, refresh=True)
        investigation.save()

        # TODO: test if there are fewer columns than required
        logger.debug("Parsing with file column %s and "
                     "auxiliary file column %s",
                     self.file_column_index, self.auxiliary_file_column_index)
        # UUIDs of data files to postpone importing until parsing is finished
        data_files = []
        # iterate over non-header rows in file
        for row in self.metadata_reader:
            # TODO: resolve relative indices
            internal_source_column_index = self.source_column_index
            internal_sample_column_index = self.sample_column_index
            internal_assay_column_index = self.assay_column_index
            # add data file to file store
            data_file_path = self.file_source_translator(
                row[self.file_column_index])
            data_file_uuid = create(
                source=data_file_path, permanent=self.file_permanent)
            data_files.append(data_file_uuid)
            # add auxiliary file to file store
            if self.auxiliary_file_column_index is not None:
                auxiliary_file_path = self.file_source_translator(
                    row[self.auxiliary_file_column_index])
                auxiliary_file_uuid = create(
                    source=auxiliary_file_path, permanent=self.file_permanent)
                data_files.append(auxiliary_file_uuid)
            else:
                auxiliary_file_uuid = None
            # add files to file server
            # TODO: add error handling in case of None values for UUIDs
            file_server.models.add(data_file_uuid, auxiliary_file_uuid)
            # create nodes if file was successfully created
            # source node
            source_name = self._create_name(
                row, internal_source_column_index, self.file_column_index)
            source_node, is_source_new = Node.objects.get_or_create(
                study=study, name=source_name, type=Node.SOURCE)
            # sample node
            sample_name = self._create_name(
                row, internal_sample_column_index, self.file_column_index)
            sample_node, is_sample_new = Node.objects.get_or_create(
                study=study, name=sample_name, type=Node.SAMPLE)
            source_node.add_child(sample_node)
            # assay node
            assay_name = self._create_name(
                row, internal_assay_column_index, self.file_column_index)
            assay_node, is_assay_new = Node.objects.get_or_create(
                study=study, assay=assay, name=assay_name, type=Node.ASSAY)
            sample_node.add_child(assay_node)
            file_node = Node.objects.create(
                study=study, assay=assay,
                name=row[self.file_column_index].strip(),
                file_uuid=data_file_uuid, type=Node.RAW_DATA_FILE,
                species=self._get_species(row),
                genome_build=self._get_genome_build(row),
                is_annotation=self._is_annotation(row))
            assay_node.add_child(file_node)
            # iterate over columns to create attributes to attach to sample
            # node
            for column_index in range(0, len(row)):
                # skip data file column
                if (self.file_column_index == column_index or
                        self.auxiliary_file_column_index == column_index or
                        self.annotation_column_index == column_index):
                    continue
                # create attribute as characteristic and attach to sample node
                # if the sample node was newly created
                if is_sample_new:
                    Attribute.objects.create(
                        node=sample_node, type=Attribute.CHARACTERISTICS,
                        subtype=self.headers[column_index].strip().lower(),
                        value=row[column_index].strip()
                    )
        # kick off data file importing tasks
        for uuid in data_files:
            import_file.delay(uuid)
        return investigation
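
# Editor's note: the run() methods above label nodes via self._create_name(row,
# name_column_index, file_column_index). Below is a minimal sketch of such a helper,
# assuming it falls back to the data file column when no dedicated name column is
# configured; the behavior shown is an illustrative assumption, not this project's
# actual implementation.
def _create_name(row, column_index, fallback_column_index):
    """Return a node name from the given column, falling back to the data file
    column when no name column is configured (assumed behavior).
    """
    if column_index is not None:
        return row[column_index].strip()
    return row[fallback_column_index].strip()
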
示例#21
0
    def run(self):
        # create investigation, study and assay objects
        investigation = self._create_investigation()
        # FIXME: self.metadata_file.name may not be informative, especially in
        # case of temp files that don't exist on disk
        study = self._create_study(investigation=investigation,
                                   file_name=self.metadata_file.name)
        assay = self._create_assay(study=study,
                                   file_name=self.metadata_file.name)

        # import file as "pre-isa" file
        logger.info("trying to add pre-isa archive file %s",
                    self.metadata_file.name)
        # FIXME: this will not create a FileStoreItem if self.metadata_file
        # does not exist on disk (e.g., a file object like TemporaryFile)
        investigation.pre_isarchive_file = create(self.metadata_file.name)
        import_file(investigation.pre_isarchive_file, refresh=True)
        investigation.save()

        # TODO: test if there are fewer columns than required
        logger.debug(
            "Parsing with file column %s and "
            "auxiliary file column %s", self.file_column_index,
            self.auxiliary_file_column_index)
        # UUIDs of data files to postpone importing until parsing is finished
        data_files = []
        # iterate over non-header rows in file
        for row in self.metadata_reader:
            # TODO: resolve relative indices
            internal_source_column_index = self.source_column_index
            internal_sample_column_index = self.sample_column_index
            internal_assay_column_index = self.assay_column_index
            # add data file to file store
            data_file_path = self.file_source_translator(
                row[self.file_column_index])
            data_file_uuid = create(source=data_file_path)
            data_files.append(data_file_uuid)
            # add auxiliary file to file store
            if self.auxiliary_file_column_index is not None:
                auxiliary_file_path = self.file_source_translator(
                    row[self.auxiliary_file_column_index])
                auxiliary_file_uuid = create(source=auxiliary_file_path)
                data_files.append(auxiliary_file_uuid)
            else:
                auxiliary_file_uuid = None
            # add files to file server
            # TODO: add error handling in case of None values for UUIDs
            file_server.models.add(data_file_uuid, auxiliary_file_uuid)
            # create nodes if file was successfully created
            # source node
            source_name = self._create_name(row, internal_source_column_index,
                                            self.file_column_index)
            source_node, is_source_new = Node.objects.get_or_create(
                study=study, name=source_name, type=Node.SOURCE)
            # sample node
            sample_name = self._create_name(row, internal_sample_column_index,
                                            self.file_column_index)
            sample_node, is_sample_new = Node.objects.get_or_create(
                study=study, name=sample_name, type=Node.SAMPLE)
            source_node.add_child(sample_node)
            # assay node
            assay_name = self._create_name(row, internal_assay_column_index,
                                           self.file_column_index)
            assay_node, is_assay_new = Node.objects.get_or_create(
                study=study, assay=assay, name=assay_name, type=Node.ASSAY)
            sample_node.add_child(assay_node)
            file_node = Node.objects.create(
                study=study,
                assay=assay,
                name=row[self.file_column_index].strip(),
                file_uuid=data_file_uuid,
                type=Node.RAW_DATA_FILE,
                species=self._get_species(row),
                genome_build=self._get_genome_build(row),
                is_annotation=self._is_annotation(row))
            assay_node.add_child(file_node)
            # iterate over columns to create attributes to attach to sample
            # node
            for column_index in range(0, len(row)):
                # skip data file column
                if (self.file_column_index == column_index
                        or self.auxiliary_file_column_index == column_index
                        or self.annotation_column_index == column_index):
                    continue
                # create attribute as characteristic and attach to sample node
                # if the sample node was newly created
                if is_sample_new:
                    Attribute.objects.create(
                        node=sample_node,
                        type=Attribute.CHARACTERISTICS,
                        subtype=self.headers[column_index].strip().lower(),
                        value=row[column_index].strip())

        # Start remote file import tasks if the `Make Import Permanent:` flag
        # was set by the user. Likewise, import the files if their source
        # begins with the REFINERY_DATA_IMPORT_DIR setting or "s3://" (the
        # case when users upload data files along with their metadata).
        for uuid in data_files:
            try:
                file_store_item = FileStoreItem.objects.get(uuid=uuid)
            except (FileStoreItem.DoesNotExist,
                    FileStoreItem.MultipleObjectsReturned) as e:
                logger.error("Couldn't properly fetch FileStoreItem %s", e)
            else:
                if (self.file_permanent or file_store_item.source.startswith(
                        (settings.REFINERY_DATA_IMPORT_DIR, 's3://'))):
                    import_file.delay(uuid)

        return investigation
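
# Editor's note: self.file_source_translator in the run() methods above is expected to
# turn the raw value of the data file column into a resolvable source (an absolute path
# or a URL). The sketch below shows one plausible shape for such a callable, assuming
# relative paths are resolved against a base directory; the name and behavior are
# illustrative assumptions, not this project's actual implementation.
import os


def make_file_source_translator(base_path):
    """Return a callable that resolves a metadata cell to a file source (sketch)."""
    def translate(source):
        source = source.strip()
        # pass URLs and absolute paths through unchanged
        if source.startswith(('http://', 'https://', 'ftp://', 's3://')) or \
                os.path.isabs(source):
            return source
        # otherwise resolve relative to the assumed base directory
        return os.path.join(base_path, source)
    return translate
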
def download_history_files(analysis):
    """Download entire histories from Galaxy, moving files out of the history
    and into the file store.
    """
    logger.debug("analysis_manger.download_history_files called")

    # retrieving list of files to download for workflow
    #TODO: handle Django exceptions
    analysis = Analysis.objects.get(uuid=analysis.uuid)
    dl_files = analysis.workflow_dl_files

    # creating dictionary of files to download, predetermined by workflow
    # "keep" operators
    dl_dict = {}
    for dl in dl_files.all():
        temp_dict = {}
        temp_dict['filename'] = dl.filename
        temp_dict['pair_id'] = dl.pair_id
        dl_dict[str(dl.step_id)] = temp_dict

    task_list = []
    # gets current galaxy connection
    connection = analysis.get_galaxy_connection()
    try:
        download_list = connection.get_history_file_list(analysis.history_id)
    except RuntimeError as exc:
        error_msg = "Post-processing failed: " + \
            "error downloading Galaxy history files for analysis '{}': {}" \
            .format(analysis.name, exc.message)
        logger.error(error_msg)
        if not isinstance(exc, (ConnectionError, TimeoutError, AuthError)):
            analysis.set_status(Analysis.FAILURE_STATUS, error_msg)
            try:
                analysis.delete_galaxy_library()
                analysis.delete_galaxy_workflow()
                analysis.delete_galaxy_history()
            except RuntimeError:
                logger.error(
                    "Cleanup failed for analysis '{}'".format(analysis.name))
        return task_list

    # Iterating through files in current galaxy history
    for results in download_list:
        # download file if result state is "ok"
        if results['state'] == 'ok':
            file_type = results["type"]
            curr_file_id = results['name']

            if curr_file_id in dl_dict:
                curr_dl_dict = dl_dict[curr_file_id]
                result_name = curr_dl_dict['filename'] + '.' + file_type
                # size of file defined by galaxy
                file_size = results['file_size']

                # determine whether galaxy results should be downloaded
                # through HTTP or copied directly from disk
                local_download = analysis.workflow.workflow_engine.instance.local_download

                # HTML files are always retrieved as zip archives via the
                # dataset URL, even when local download is enabled
                if local_download and file_type != 'html':
                    download_url = results['file_name']
                else:
                    download_url = connection.make_url(
                        str(results['dataset_id']), is_data=True, key=False)

                # workaround to set the correct file type for zip archives of
                # reports produced by FASTQC
                if file_type == 'html':
                    file_type = 'zip'

                # getting file_store_uuid
                # TODO: when changing permanent=True, fix update of % download of file
                filestore_uuid = create(
                    source=download_url,
                    filetype=file_type,
                    permanent=False
                )

                # adding history files to django model
                temp_file = AnalysisResult(
                    analysis_uuid=analysis.uuid, file_store_uuid=filestore_uuid,
                    file_name=result_name, file_type=file_type)
                temp_file.save()
                analysis.results.add(temp_file)
                analysis.save()

                # downloading analysis results into file_store
                # only download files if size is greater than 0
                if file_size > 0:
                    #task_id = import_file.subtask((filestore_uuid, True, False, file_size,))
                    # local download, force copying into the file_store instead of symlinking
                    if local_download:
                        task_id = import_file.subtask(
                            (filestore_uuid, False, True, file_size,))
                    else:
                        task_id = import_file.subtask(
                            (filestore_uuid, False, False, file_size,))
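                    # note: the extra positional flags passed to import_file
                    # appear to control permanence and local copying vs.
                    # symlinking (compare the commented-out call above); this
                    # is inferred from context, not documented here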
                    task_list.append(task_id)

    return task_list
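
# Editor's note: download_history_files() only collects import_file subtasks; a caller
# still has to dispatch them. A minimal sketch using Celery's group primitive is shown
# below; the calling convention is an assumption, not code from this project.
from celery import group


def dispatch_history_downloads(analysis):
    """Queue all history-file import subtasks for an analysis (sketch)."""
    task_list = download_history_files(analysis)
    if not task_list:
        return None
    # run the import subtasks in parallel and return the async result handle
    return group(task_list).apply_async()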