Пример #1
0
    def _get_from_vfdb_common(self, outprefix, filename, info_text):
        """Download one VFDB file and run the VFDB parser on it.

        The file is fetched into a scratch directory named
        ``<outprefix>.tmp.download``, handed to ``vfdb_parser.VfdbParser``
        together with ``outprefix``, and the scratch directory is removed
        afterwards unless ``self.debug`` is set. Usage and citation hints
        naming ``<outprefix>.fa`` and ``<outprefix>.tsv`` are printed at the end.

        Args:
            outprefix: Prefix for the output files; converted to an absolute path.
            filename: Name of the file to fetch from ``http://www.mgc.ac.cn/VFs/Down/``.
            info_text: Accepted for interface compatibility; not used here.

        Raises:
            Error: If the scratch directory cannot be created.
        """
        # NOTE(review): info_text is never read and the summary message below
        # always says "core" -- confirm with the callers whether it should be
        # interpolated into that message.
        outprefix = os.path.abspath(outprefix)
        tmpdir = outprefix + '.tmp.download'

        try:
            os.mkdir(tmpdir)
        except OSError as err:
            # Only filesystem failures are translated; anything else
            # (e.g. KeyboardInterrupt) propagates untouched.
            raise Error('Error mkdir ' + tmpdir) from err

        # Named download_path rather than "zipfile" to avoid shadowing the
        # standard-library module of that name.
        download_path = os.path.join(tmpdir, filename)
        try:
            common.download_file('http://www.mgc.ac.cn/VFs/Down/' + filename, download_path, max_attempts=self.max_download_attempts, sleep_time=self.sleep_time, verbose=True)
            print('Extracting files ... ', end='', flush=True)
            vparser = vfdb_parser.VfdbParser(download_path, outprefix)
            vparser.run()
        finally:
            # Clean up on failure too, otherwise a re-run with the same
            # outprefix would fail at os.mkdir above.
            if not self.debug:
                common.rmtree(tmpdir)
        print('done')
        final_fasta = outprefix + '.fa'
        final_tsv = outprefix + '.tsv'

        print('Extracted core DNA sequence dataset and metadata. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
        print('You can use them with ARIBA like this:')
        print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
        print('If you use this downloaded data, please cite:')
        print('"VFDB 2016: hierarchical and refined dataset for big data analysis-10 years on",\nChen LH et al 2016, Nucleic Acids Res. 44(Database issue):D694-D697. PMID: 26578559\n')
Пример #2
0
    def _get_card_versions(self, tmp_file):
        """Scrape the CARD download page for available release archives.

        The page is downloaded to ``tmp_file`` and scanned line by line for
        links whose path contains ``broad`` and ends in
        ``v<major>.<minor>.<patch>.tar.gz`` or ``.tar.bz2``. ``tmp_file`` is
        deleted once it has been read, whether or not any version was found.

        Args:
            tmp_file: Path where the download page is stored temporarily.

        Returns:
            dict mapping a version tuple of ints, e.g. ``(1, 0, 9)``, to the
            absolute download URL for that release.

        Raises:
            Error: If no matching download link is found on the page.
        """
        print('Getting available CARD versions')
        common.download_file('https://card.mcmaster.ca/download',
                             tmp_file,
                             max_attempts=self.max_download_attempts,
                             sleep_time=self.sleep_time,
                             verbose=True)
        p = re.compile(
            r'''href="(/download/.*?broad.*?v([0-9]+\.[0-9]+\.[0-9]+)\.tar\.(gz|bz2))"'''
        )
        versions = {}

        try:
            with open(tmp_file) as f:
                for line in f:
                    # Each match is (relative link, version string, extension).
                    for match in p.findall(line):
                        key = tuple(int(x) for x in match[1].split('.'))
                        versions[key] = 'https://card.mcmaster.ca' + match[0]
        finally:
            # Remove the scratch file on the error path as well, so a failed
            # lookup does not leave it behind.
            os.unlink(tmp_file)

        if not versions:
            raise Error('Error getting CARD versions. Cannot continue')

        print('Found versions:')

        for key, url in sorted(versions.items()):
            print('.'.join(str(x) for x in key), url, sep='\t')

        return versions
Пример #3
0
    def _get_from_vfdb_common(self, outprefix, filename, info_text):
        """Download one VFDB file and run the VFDB parser on it.

        The file is fetched into a scratch directory named
        ``<outprefix>.tmp.download``, handed to ``vfdb_parser.VfdbParser``
        together with ``outprefix``, and the scratch directory is removed
        afterwards unless ``self.debug`` is set. Usage and citation hints
        naming ``<outprefix>.fa`` and ``<outprefix>.tsv`` are printed at the end.

        Args:
            outprefix: Prefix for the output files; converted to an absolute path.
            filename: Name of the file to fetch from ``http://www.mgc.ac.cn/VFs/Down/``.
            info_text: Accepted for interface compatibility; not used here.

        Raises:
            Error: If the scratch directory cannot be created.
        """
        # NOTE(review): info_text is never read and the summary message below
        # always says "core" -- confirm with the callers whether it should be
        # interpolated into that message.
        outprefix = os.path.abspath(outprefix)
        tmpdir = outprefix + '.tmp.download'

        try:
            os.mkdir(tmpdir)
        except OSError as err:
            # Only filesystem failures are translated; anything else
            # (e.g. KeyboardInterrupt) propagates untouched.
            raise Error('Error mkdir ' + tmpdir) from err

        # Named download_path rather than "zipfile" to avoid shadowing the
        # standard-library module of that name.
        download_path = os.path.join(tmpdir, filename)
        try:
            common.download_file('http://www.mgc.ac.cn/VFs/Down/' + filename, download_path, max_attempts=self.max_download_attempts, sleep_time=self.sleep_time, verbose=True)
            print('Extracting files ... ', end='', flush=True)
            vparser = vfdb_parser.VfdbParser(download_path, outprefix)
            vparser.run()
        finally:
            # Clean up on failure too, otherwise a re-run with the same
            # outprefix would fail at os.mkdir above.
            if not self.debug:
                shutil.rmtree(tmpdir)
        print('done')
        final_fasta = outprefix + '.fa'
        final_tsv = outprefix + '.tsv'

        print('Extracted core DNA sequence dataset and metadata. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
        print('You can use them with ARIBA like this:')
        print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
        print('If you use this downloaded data, please cite:')
        print('"VFDB 2016: hierarchical and refined dataset for big data analysis-10 years on",\nChen LH et al 2016, Nucleic Acids Res. 44(Database issue):D694-D697. PMID: 26578559\n')
Пример #4
0
    def _get_from_argannot(self, outprefix):
        """Download the ARG-ANNOT archive and write ``.fa``/``.tsv`` files.

        The zip archive is downloaded and unpacked inside a scratch directory
        ``<outprefix>.tmp.download``. Every sequence in
        ``Database Nt Sequences File.txt`` is written to ``<outprefix>.fa``
        under a rewritten ID, with one matching row in ``<outprefix>.tsv``
        that records the original name. The scratch directory is removed
        afterwards unless ``self.debug`` is set.

        Args:
            outprefix: Prefix for the output files; converted to an absolute path.

        Raises:
            Error: If the scratch directory cannot be created or entered.
        """
        outprefix = os.path.abspath(outprefix)
        tmpdir = outprefix + '.tmp.download'
        current_dir = os.getcwd()

        try:
            os.mkdir(tmpdir)
            os.chdir(tmpdir)
        except OSError as err:
            # Only filesystem failures are translated; anything else
            # (e.g. KeyboardInterrupt) propagates untouched.
            raise Error('Error mkdir/chdir ' + tmpdir) from err

        zipfile = 'arg-annot-database_doc.zip'
        final_fasta = outprefix + '.fa'
        final_tsv = outprefix + '.tsv'

        try:
            try:
                # Download and unzip use relative paths, so they run with the
                # scratch directory as the working directory.
                common.download_file(
                    'http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/304/arg-annot-database_doc.zip',
                    zipfile,
                    max_attempts=self.max_download_attempts,
                    sleep_time=self.sleep_time,
                    verbose=True)
                common.syscall('unzip ' + zipfile)
            finally:
                # Always hand the caller's working directory back, even when
                # the download or extraction fails.
                os.chdir(current_dir)
            print('Extracted files.')

            genes_file = os.path.join(tmpdir, 'Database Nt Sequences File.txt')
            seq_reader = pyfastaq.sequences.file_reader(genes_file)
            f_out_tsv = None
            f_out_fa = None

            try:
                f_out_tsv = pyfastaq.utils.open_file_write(final_tsv)
                f_out_fa = pyfastaq.utils.open_file_write(final_fasta)

                for seq in seq_reader:
                    original_id = seq.id
                    # Keep only the first whitespace-delimited token of the ID
                    # and turn "(X)" into "X.".
                    seq.id = re.sub(r'\((.*)\)', r'\1.', seq.id.split()[0])
                    print(seq, file=f_out_fa)
                    print(seq.id,
                          '1',
                          '0',
                          '.',
                          '.',
                          'Original name: ' + original_id,
                          sep='\t',
                          file=f_out_tsv)
            finally:
                # Close whichever handles were opened, even if writing failed.
                for handle in (f_out_tsv, f_out_fa):
                    if handle is not None:
                        pyfastaq.utils.close(handle)
        finally:
            # Clean up on failure too, otherwise a re-run with the same
            # outprefix would fail at os.mkdir above.
            if not self.debug:
                common.rmtree(tmpdir)

        print('Finished. Final files are:',
              final_fasta,
              final_tsv,
              sep='\n\t',
              end='\n\n')
        print('You can use them with ARIBA like this:')
        print('ariba prepareref -f', final_fasta, '-m', final_tsv,
              'output_directory\n')
        print('If you use this downloaded data, please cite:')
        print(argannot_ref)
Пример #5
0
 def run(self):
     """Fetch the zip archive, convert its contents, then delete the scratch data.

     Downloads ``self.zip_url`` to ``self.zip_file``, extracts it into
     ``<zip_file>.tmp.extract``, loads the annotations, header mappings and
     FASTA members, and passes everything to ``_write_files`` with
     ``self.outprefix``. The extraction directory and the zip file are
     removed at the end.
     """
     common.download_file(self.zip_url, self.zip_file, verbose=True)
     extract_dir = self.zip_file + '.tmp.extract'
     # _extract_files returns a mapping indexed below by 'annotations',
     # 'header_mappings' and 'fasta'; the values are joined onto extract_dir.
     members = MegaresZipParser._extract_files(self.zip_file, extract_dir)

     def member_path(key):
         return os.path.join(extract_dir, members[key])

     annotation_data = MegaresZipParser._load_annotations_file(
         member_path('annotations'))
     header_data = MegaresZipParser._load_header_mappings_file(
         member_path('header_mappings'))

     # file_to_dict fills the dictionary passed to it.
     sequences = {}
     pyfastaq.tasks.file_to_dict(member_path('fasta'), sequences)

     MegaresZipParser._write_files(
         self.outprefix, sequences, annotation_data, header_data)

     common.rmtree(extract_dir)
     os.unlink(self.zip_file)
Пример #6
0
    def _get_from_argannot(self, outprefix):
        """Download the ARG-ANNOT archive and write ``.fa``/``.tsv`` files.

        The zip archive is downloaded and unpacked inside a scratch directory
        ``<outprefix>.tmp.download``. Every sequence in
        ``Database Nt Sequences File.txt`` is written to ``<outprefix>.fa``
        under a rewritten ID, with one matching row in ``<outprefix>.tsv``
        that records the original name. The scratch directory is removed
        afterwards unless ``self.debug`` is set.

        Args:
            outprefix: Prefix for the output files; converted to an absolute path.

        Raises:
            Error: If the scratch directory cannot be created or entered.
        """
        outprefix = os.path.abspath(outprefix)
        tmpdir = outprefix + '.tmp.download'
        current_dir = os.getcwd()

        try:
            os.mkdir(tmpdir)
            os.chdir(tmpdir)
        except OSError as err:
            # Only filesystem failures are translated; anything else
            # (e.g. KeyboardInterrupt) propagates untouched.
            raise Error('Error mkdir/chdir ' + tmpdir) from err

        zipfile = 'arg-annot-database_doc.zip'
        final_fasta = outprefix + '.fa'
        final_tsv = outprefix + '.tsv'

        try:
            try:
                # Download and unzip use relative paths, so they run with the
                # scratch directory as the working directory.
                common.download_file('http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/304/arg-annot-database_doc.zip', zipfile, max_attempts=self.max_download_attempts, sleep_time=self.sleep_time, verbose=True)
                common.syscall('unzip ' + zipfile)
            finally:
                # Always hand the caller's working directory back, even when
                # the download or extraction fails.
                os.chdir(current_dir)
            print('Extracted files.')

            genes_file = os.path.join(tmpdir, 'Database Nt Sequences File.txt')
            seq_reader = pyfastaq.sequences.file_reader(genes_file)
            f_out_tsv = None
            f_out_fa = None

            try:
                f_out_tsv = pyfastaq.utils.open_file_write(final_tsv)
                f_out_fa = pyfastaq.utils.open_file_write(final_fasta)

                for seq in seq_reader:
                    original_id = seq.id
                    # Keep only the first whitespace-delimited token of the ID
                    # and turn "(X)" into "X.".
                    seq.id = re.sub(r'\((.*)\)', r'\1.', seq.id.split()[0])
                    print(seq, file=f_out_fa)
                    print(seq.id, '1', '0', '.', '.', 'Original name: ' + original_id, sep='\t', file=f_out_tsv)
            finally:
                # Close whichever handles were opened, even if writing failed.
                for handle in (f_out_tsv, f_out_fa):
                    if handle is not None:
                        pyfastaq.utils.close(handle)
        finally:
            # Clean up on failure too, otherwise a re-run with the same
            # outprefix would fail at os.mkdir above.
            if not self.debug:
                shutil.rmtree(tmpdir)

        print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
        print('You can use them with ARIBA like this:')
        print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
        print('If you use this downloaded data, please cite:')
        print(argannot_ref)
Пример #7
0
    def _get_card_versions(self, tmp_file):
        """Scrape the CARD download page for available release archives.

        The page is downloaded to ``tmp_file`` and scanned line by line for
        links whose path contains ``broad`` and ends in
        ``v<major>.<minor>.<patch>.tar.bz2``; other archive extensions are
        not recognised by the pattern. ``tmp_file`` is deleted once it has
        been read, whether or not any version was found.

        Args:
            tmp_file: Path where the download page is stored temporarily.

        Returns:
            dict mapping a version tuple of ints, e.g. ``(1, 0, 9)``, to the
            absolute download URL for that release.

        Raises:
            Error: If no matching download link is found on the page.
        """
        print('Getting available CARD versions')
        common.download_file('https://card.mcmaster.ca/download', tmp_file, max_attempts=self.max_download_attempts, sleep_time=self.sleep_time, verbose=True)
        p = re.compile(r'''href="(/download/.*?broad.*?v([0-9]+\.[0-9]+\.[0-9]+)\.tar\.bz2)"''')
        versions = {}

        try:
            with open(tmp_file) as f:
                for line in f:
                    # Each match is (relative link, version string).
                    for match in p.findall(line):
                        key = tuple(int(x) for x in match[1].split('.'))
                        versions[key] = 'https://card.mcmaster.ca' + match[0]
        finally:
            # Remove the scratch file on the error path as well, so a failed
            # lookup does not leave it behind.
            os.unlink(tmp_file)

        if not versions:
            raise Error('Error getting CARD versions. Cannot continue')

        print('Found versions:')

        for key, url in sorted(versions.items()):
            print('.'.join(str(x) for x in key), url, sep='\t')

        return versions