Example #1
 def download_file(self, url):
     """Fetches a remote file. Uses the cookies file from cURL to
        authenticate."""
     self.mainlog.debug('Fetching ' + url)
     #   Build the full URL to fetch
     full_url = self.DL_BASE + url
     #   And build the command to download it
     cmd = [
         'curl',
         full_url,
         '-b',
         self.cookie.name,
         '-o',
         file_funcs.local_name(url)
         ]
     #   Then download it
     p = subprocess.Popen(
         cmd,
         shell=False,
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE
         )
     out, err = p.communicate()
     self.mainlog.debug('Done fetching ' + url)
     return
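The `file_funcs.local_name` helper used above is not part of these examples. A minimal sketch of what it might look like, assuming it simply returns the final path component of the URL (the real module may differ):

# Hypothetical sketch of file_funcs.local_name; the real helper is not
# shown here, so this is only an assumption about its behavior.
import posixpath
from urllib.parse import urlparse


def local_name(url):
    """Return the file name component of a URL, e.g.
    '/some/path/Athaliana_167_cds.fa.gz' -> 'Athaliana_167_cds.fa.gz'."""
    return posixpath.basename(urlparse(url).path)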
Example #2
 def get_xml_urls(self):
     """Gets the URLs and th MD5s of the CDS files from the XML file from
        Phytozome. Stores these data in `urls' and `md5s' respectively."""
     self.mainlog.debug('Fetching XML')
     #   Create another temporary named file for the XML output
     xml_out = tempfile.NamedTemporaryFile(
         mode='w+t',
         prefix='BAD_Mutations_JGI_XML_',
         suffix='.xml',
         delete=False)
     self.mainlog.debug('XML will be stored in ' + xml_out.name)
     #   Use cURL to download the XML, passing the cookies we generated
     #   earlier to authenticate.
     cmd = [
         'curl',
         self.XML_URL,
         '-b',
         self.cookie.name,
         '-o',
         xml_out.name
         ]
     #   Execute the command
     p = subprocess.Popen(
         cmd,
         shell=False,
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE)
     out, err = p.communicate()
     self.mainlog.debug('cURL stdout: ' + out.decode('utf-8'))
     self.mainlog.debug('cURL stderr: ' + err.decode('utf-8'))
     #   Then, read the XML back from the file
     xml = xml_out.read()
     #   This suffix is what we want the filenames to end with;
     #   it can change depending on the target of the LRT
     suffix = '.cds.fa.gz'
     #   Log the raw XML we read back, for debugging
     self.mainlog.debug('The XML I got was \n\n' + xml)
     #   Create an element tree out of it, so we can easily step
     #   through the data
     xml_tree = ElementTree.fromstring(xml)
     #   Step through it and extract all CDS URLs
     for elem in xml_tree.findall('.//file'):
         #   if the URL ends in a certain suffix, then save it
         if elem.attrib.get('url').endswith(suffix):
             url = elem.attrib.get('url')
             md5 = elem.attrib.get('md5')
             #   Check to see that the file is in the list of
             #   species to download
             local_filename = file_funcs.local_name(url)
             species_name = file_funcs.species_name(local_filename)
             if species_name in self.TO_FETCH:
                 self.urls.append(url)
                 self.md5s.append(md5)
     self.mainlog.debug('Found ' + str(len(self.urls)) + ' files to fetch')
     return
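The parsing step above is easy to try in isolation. A self-contained sketch, using a made-up XML fragment modeled only on the attributes the code reads ('url' and 'md5'); the real Phytozome listing may be structured differently:

from xml.etree import ElementTree

# Made-up directory listing; only the 'url' and 'md5' attributes matter here.
xml = """
<organismDownloads>
  <folder name="Athaliana">
    <file url="/path/Athaliana_167_cds.fa.gz" md5="0123456789abcdef"/>
    <file url="/path/Athaliana_167_gene.gff3.gz" md5="fedcba9876543210"/>
  </folder>
</organismDownloads>
"""

xml_tree = ElementTree.fromstring(xml)
for elem in xml_tree.findall('.//file'):
    url = elem.attrib.get('url')
    if url.endswith('.cds.fa.gz'):
        print(url, elem.attrib.get('md5'))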
Example #3
 def get_xml_urls(self):
     """Gets the URLs and th MD5s of the CDS files from the XML file from
        Phytozome. Stores these data in `urls' and `md5s' respectively."""
     self.mainlog.debug('Fetching XML')
     #   Create another temporary named file for the XML output
     xml_out = tempfile.NamedTemporaryFile(
         mode='w+t',
         prefix='BAD_Mutations_JGI_XML_',
         suffix='.xml',
         delete=False)
     #   Use cURL to download the XML, passing the cookies we generated
     #   earlier to authenticate.
     cmd = [
         'curl',
         self.XML_URL,
         '-b',
         self.cookie.name,
         '-o',
         xml_out.name
         ]
     #   Execute the command
     p = subprocess.Popen(
         cmd,
         shell=False,
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE)
     out, err = p.communicate()
     #   Then, read the XML back from the file
     xml = xml_out.read()
     #   This suffix is what we want the filenames to end with;
     #   it can change depending on the target of the LRT
     suffix = '.cds.fa.gz'
     #   Log the raw XML we read back, for debugging
     self.mainlog.debug('The XML I got was \n\n' + xml)
     #   Create an element tree out of it, so we can easily step
     #   through the data
     xml_tree = ElementTree.fromstring(xml)
     #   Step through it and extract all CDS URLs
     for elem in xml_tree.findall('.//file'):
         #   if the URL ends in a certain suffix, then save it
         if elem.attrib.get('url').endswith(suffix):
             url = elem.attrib.get('url')
             md5 = elem.attrib.get('md5')
             #   Check to see that the file is in the list of
             #   species to download
             local_filename = file_funcs.local_name(url)
             species_name = file_funcs.species_name(local_filename)
             if species_name in self.TO_FETCH:
                 self.urls.append(url)
                 self.md5s.append(md5)
     self.mainlog.debug('Found ' + str(len(self.urls)) + ' files to fetch')
     return
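Neither version checks cURL's exit status, so a failed download only surfaces later, in the XML parsing step. A sketch of a more defensive variant using subprocess.run (Python 3.5+); the function name and parameters are placeholders, not part of the original class:

import subprocess


def fetch_xml(xml_url, cookie_file, out_path, log):
    """Run cURL and raise if it exits non-zero (placeholder helper)."""
    cmd = ['curl', xml_url, '-b', cookie_file, '-o', out_path]
    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    log.debug('cURL stderr: ' + proc.stderr.decode('utf-8'))
    if proc.returncode != 0:
        raise RuntimeError('curl exited with status ' + str(proc.returncode))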
Example #4
 def download_files(self):
     """Iterate through the list of URLs and download the appropriate
     files. Computes the CRC sum of existing files and compares them to
     the remote checksum to decide whether or not to to download."""
     #   For each URL we have:
     for u, c in zip(self.urls, self.cksums):
         target_dir = self.make_species_dir(u)
         #   cd into it
         os.chdir(target_dir)
         #   What is the local file name?
         lname = file_funcs.local_name(u)
         #   If it exists, we check if the checksums are the same
         if file_funcs.file_exists(lname, self.mainlog):
             local_cksum = file_funcs.calculate_crc32(lname, self.mainlog)
             crc32_same = file_funcs.checksum_is_same(
                 local_cksum, c, self.mainlog)
             if crc32_same:
                 self.mainlog.info(
                     lname + ' already exists and is current, skipping.')
                 continue
             else:
                 self.mainlog.info(lname +
                                   ' exists, but is out of date. Updating.')
                 same = False
                 while not same:
                     self.get_file(u)
                     new_local_cksum = file_funcs.calculate_crc32(
                         lname, self.mainlog)
                     same = file_funcs.checksum_is_same(
                         new_local_cksum, c, self.mainlog)
                 #   And save a record for those that need to be converted
                 self.to_convert.append(
                     os.path.join(self.base, target_dir, lname))
         #   If the file doesn't exist, treat it the same as a
         #   checksum mismatch and download it
         else:
             self.mainlog.info(lname + ' does not exist. Downloading.')
             same = False
             while not same:
                 self.get_file(u)
                 new_local_cksum = file_funcs.calculate_crc32(
                     lname, self.mainlog)
                 same = file_funcs.checksum_is_same(new_local_cksum, c,
                                                    self.mainlog)
             self.to_convert.append(
                 os.path.join(self.base, target_dir, lname))
     self.mainlog.info('Done downloading CDS files from Ensembl.')
     #   We are done with the FTP connection, log out
     self.session.quit()
     return
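The checksum helpers from file_funcs are not shown. A plausible sketch, assuming the remote value is a plain CRC32 of the file contents; the real module (and the format of Ensembl's CHECKSUMS files) may differ:

import zlib


def calculate_crc32(fname, log, chunk_size=65536):
    """Compute a CRC32 of a file, reading it in chunks to limit memory use."""
    crc = 0
    with open(fname, 'rb') as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b''):
            crc = zlib.crc32(chunk, crc)
    crc &= 0xffffffff
    log.debug(fname + ' CRC32: ' + str(crc))
    return crc


def checksum_is_same(local, remote, log):
    """Return True if the local and remote checksums match."""
    return str(local) == str(remote)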
Example #5
 def get_file(self, fname):
     """Download the file specified by `fname'"""
     with open(file_funcs.local_name(fname), 'wb') as handle:
         self.session.retrbinary('RETR ' + fname, handle.write)
     return
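Here `self.session` is assumed to be an ftplib.FTP connection opened elsewhere in the class. A minimal usage sketch with placeholder host, directory, and file names:

import ftplib

session = ftplib.FTP('ftp.example.org')    # placeholder host
session.login()                            # anonymous login
session.cwd('/pub/release/fasta')          # placeholder directory
with open('local_copy.fa.gz', 'wb') as handle:
    session.retrbinary('RETR remote_file.fa.gz', handle.write)
session.quit()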
Example #6
 def fetch_cds(self):
     """Iterates through the urls and md5s instance attributes and
        downloads the appropriate files. Checks the local MD5 against the
        remote MD5 and downloads the remote file if they differ. Appends
        the filenames of each updated file to the `to_convert' attribute."""
     self.mainlog.debug('Downloading files from ' +
                        str(len(self.urls)) +
                        ' species')
     for u, m in zip(self.urls, self.md5s):
         #   Get a local name of the CDS
         lname = file_funcs.local_name(u)
         target_dir = self.make_species_dir(u)
         os.chdir(target_dir)
         #   check to see if the file already exists
         if file_funcs.file_exists(lname, self.mainlog):
             #   Get the md5
             lmd5 = file_funcs.calculate_md5(lname, self.mainlog)
             #   Compare the MD5s
             md5s_same = file_funcs.checksum_is_same(lmd5, m, self.mainlog)
             #   If they are the same, skip it, and move on
             if md5s_same:
                 self.mainlog.info(lname + ' is current. Skipping.')
                 continue
             else:
                 self.mainlog.info(lname + ' is out of date. Downloading.')
                 #   Try to download it until the MD5s check out
                 same = False
                 while not same:
                     self.download_file(u)
                     new_lmd5 = file_funcs.calculate_md5(
                         lname,
                         self.mainlog)
                     same = file_funcs.checksum_is_same(
                         new_lmd5,
                         m,
                         self.mainlog)
                 #   Tack it onto the list of files to convert
                 self.to_convert.append(
                     os.path.join(
                         self.base,
                         target_dir,
                         lname)
                     )
         else:
             self.mainlog.info(lname + ' does not yet exist. Downloading.')
             #   And the same procedure as if the file were updated
             same = False
             while not same:
                 self.download_file(u)
                 new_lmd5 = file_funcs.calculate_md5(
                     lname,
                     self.mainlog)
                 same = file_funcs.checksum_is_same(
                     new_lmd5,
                     m,
                     self.mainlog)
             self.to_convert.append(
                 os.path.join(
                     self.base,
                     target_dir,
                     lname))
     self.mainlog.info('Done downloading CDS files from Phytozome.')
     return
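file_funcs.calculate_md5 is also not shown. A minimal sketch, assuming the remote value taken from the XML is an MD5 hex digest string:

import hashlib


def calculate_md5(fname, log, chunk_size=65536):
    """Compute the MD5 hex digest of a file, reading it in chunks."""
    md5 = hashlib.md5()
    with open(fname, 'rb') as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b''):
            md5.update(chunk)
    log.debug(fname + ' MD5: ' + md5.hexdigest())
    return md5.hexdigest()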