def __init__(self, args):
    """Keep a copy of the command-line arguments and detect the parsing mode.

    ``self.mode`` is set to ``"implicit"`` when the first argument is not a
    registered converter name and contains a dot (i.e. it looks like a
    filename); otherwise it is set to ``"explicit"``.

    :param list args: the command-line arguments; a shallow copy is stored
        in ``self.args``.
    """
    converters = Registry()
    self.args = args[:]

    if not len(self.args):
        error("Please provide at least some arguments. See --help")

    head = args[0]
    names_converter = head.lower() in list(converters.get_converters_names())

    if names_converter or "." not in head:
        self.mode = "explicit"
    else:
        self.mode = "implicit"
        # Implicit mode needs at least an input and an output entry
        # (1 to 1 or many to 1).
        if len(args) < 2:
            error("In implicit mode, you must define your input and output file (only 1 provided)")

    _log.debug("parsing mode {}".format(self.mode))
def get_output_format(input_value):
    """Return the dropdown options for the output formats of *input_value*.

    Converter names are split at their first ``'2'`` into an input part and
    an output part. Only converters whose input part is exactly
    *input_value* contribute, so a format whose name is a prefix of another
    one does not pick up that other format's outputs.

    :param str input_value: the input format to look up. Any non-string
        value (such as ``None``) yields no options.
    :return: one ``{'label': fmt, 'value': fmt}`` entry per distinct output
        format, sorted by name.
    :rtype: list
    """
    if not isinstance(input_value, str):
        return []

    registry = Registry()
    outputs = set()
    for converter in registry.get_converters_names():
        # NOTE: the name is cut at its first '2', exactly as the original
        # split('2', 1) did; names without a '2' are skipped.
        input_format, separator, output_format = converter.partition('2')
        if separator and input_format == input_value:
            outputs.add(output_format)

    return [{'label': name, 'value': name} for name in sorted(outputs)]
def get_input_format():
    """Build the options used to fill the input dropdown.

    :return: one ``{'label': fmt, 'value': fmt}`` entry per distinct input
        format found in the registry's converter names, sorted by name.
    :rtype: list
    """
    registry = Registry()
    names = list(registry.get_converters_names())

    # The input format is the part of the converter name before its first '2'.
    inputs = set()
    for name in names:
        source_fmt, _target_fmt = name.split('2', 1)
        inputs.add(source_fmt)

    # The dropdown expects dictionaries; label and value are the same.
    return [{'label': fmt, 'value': fmt} for fmt in sorted(inputs)]
class Bioconvert(object):
    """Universal converter used by the standalone

    ::

        from bioconvert import Bioconvert
        c = Bioconvert("test.fastq", "test.fasta", threads=4, force=True)

    """

    def __init__(self, infile, outfile, force=False, threads=None, extra=None):
        """.. rubric:: constructor

        :param str infile: The path of the input file (or a list of paths).
        :param str outfile: The path of the output file (or a list of paths).
        :param bool force: overwrite output file if it exists already
            otherwise raises an error
        :param threads: if not None, assigned to the converter's ``threads``
            attribute.
        :param extra: if truthy, stored on the converter as
            ``_extra_arguments``.
        :raises ValueError: an output file exists and *force* is False.
        :raises IOError: an output file ends with ``.dsrc`` but not with
            ``.fastq.dsrc``.
        :raises Exception: no direct converter and no conversion path exist
            for the requested formats.
        """
        # The input file is deliberately not checked: there are cases where
        # the input parameter is just a prefix.
        if isinstance(outfile, str):
            outfile = [outfile]
        if isinstance(infile, str):
            infile = [infile]

        # some checking on the output files (existence, special case of dsrc)
        for filename in outfile:
            if os.path.exists(filename):
                msg = "output file {} exists already.".format(filename)
                if force is False:
                    _log.critical(
                        "output file exists. If you are using bioconvert, use --force "
                    )
                    raise ValueError(msg)
                else:
                    _log.warning(msg + " --force used so will be over written")

            # Only fastq files can be compressed with dsrc: the dsrc
            # executable accepts only the .fastq file extension.
            if filename.endswith(".dsrc"):
                if filename.endswith(".fastq.dsrc") is False:
                    msg = "When compressing with .dsrc extension, " +\
                        "only files ending with .fastq extension are " +\
                        "accepted. This is due to the way dsrc executable "+\
                        "is implemented."
                    _log.critical(msg)
                    # Carry the explanation in the exception as well.
                    raise IOError(msg)

        Lin = len(infile)
        Lout = len(outfile)

        # example: fastq.gz to fasta.bz2
        # Here, we want to decompress, convert, compress, so the extensions
        # are taken without the compression suffix.
        self.inext = [getext(filename, remove_compression=True)
                      for filename in infile]
        self.outext = [getext(filename, remove_compression=True)
                       for filename in outfile]

        # special case one to one for compression/decompression
        # e.g. fastq.gz to fastq.bz2: data is not changed, just the type of
        # compression, so the full extensions are kept instead.
        if Lin == Lout == 1:
            if self.inext == self.outext:
                _log.info("decompression/compression mode")
                self.inext = [getext(infile[0])]
                self.outext = [getext(outfile[0])]

        self.mapper = Registry()

        # Converter names that appear verbatim on the command line.
        requested = list(
            set(list(self.mapper.get_converters_names())).intersection(
                sys.argv))

        if not requested:
            # get format from extensions
            in_fmt = [get_format_from_extension(x) for x in self.inext]
            out_fmt = [get_format_from_extension(x) for x in self.outext]
        else:
            in_fmt, out_fmt = ConvMeta.split_converter_to_format(requested[0])

        # NOTE(review): the previous code built lower-cased lists and then
        # immediately overwrote them with tuple(in_fmt) / tuple(out_fmt);
        # the formats are therefore kept in their original case here.
        self.in_fmt = tuple(in_fmt)
        self.out_fmt = tuple(out_fmt)

        _log.info("Input: {}".format(self.in_fmt))
        _log.info("Output: {}".format(self.out_fmt))

        try:
            class_converter = self.mapper[(self.in_fmt, self.out_fmt)]
        except KeyError:
            # No direct converter: try to find a path of converters
            conv_path = self.mapper.conversion_path(self.in_fmt, self.out_fmt)
            _log.debug("path: {}".format(conv_path))
            if conv_path:
                _log.info("Direct conversion not implemented. "
                          "Chaining converters.")
                # implemented in bioconvert/core/base.py
                # using temporary files
                class_converter = make_chain([(pair, self.mapper[pair])
                                              for pair in conv_path])
            else:
                msg = "Requested input format ('{}') to output format ('{}') is not available in bioconvert".format(
                    self.in_fmt,
                    self.out_fmt,
                )
                _log.critical(msg)
                _log.critical(
                    "Use --formats to know the available formats and --help for examples"
                )
                raise Exception(msg)
        else:
            self.name = class_converter.__name__

        # FIXME: hack for the compression/decompression decorators:
        # single files are passed as plain values, not one-element lists.
        if Lin == 1:
            infile = infile[0]
        if Lout == 1:
            outfile = outfile[0]

        self.converter = class_converter(infile, outfile)

        # If threads is provided, we update the threads attribute
        if threads is not None:
            self.converter.threads = threads

        if extra:
            self.converter._extra_arguments = extra

        _log.info("Using {} class (with {} threads if needed)".format(
            self.converter.name, self.converter.threads))

    def __call__(self, *args, **kwargs):
        """Run the underlying converter with the given arguments."""
        self.converter(*args, **kwargs)

    def boxplot_benchmark(self, *args, **kwargs):
        """Forward to the underlying converter's ``boxplot_benchmark``."""
        self.converter.boxplot_benchmark(*args, **kwargs)
def main(args=None):
    """Command-line entry point of the ``bioconvert`` standalone.

    Steps, in order:

    1. Implicit mode: when the first argument is not a registered converter
       name and contains a dot, the converter is looked up from the
       extensions of the first two arguments and its name is inserted in
       front of the arguments.
    2. The log level and the indirect-conversion switch are pre-scanned from
       the raw arguments, before argparse runs.
    3. One sub-parser per conversion is registered and the arguments are
       parsed; on a parsing failure the closest converter names are
       suggested.
    4. ``--version``, ``--dependency-report`` and ``--conversion-graph`` are
       served and exit; otherwise ``analysis`` is called for each input file.

    :param list args: command-line arguments without the program name.
        Defaults to ``sys.argv[1:]``. The list passed in is not modified.
    """
    import glob

    registry = Registry()

    # Work on a copy so the caller's list is never mutated by insert() below.
    args = sys.argv[1:] if args is None else list(args)

    if args:
        # check that the first argument is not a converter in the registry
        if args[0].lower() not in list(registry.get_converters_names()) \
                and "." in args[0]:

            if len(args) < 2:
                _log.error("In implicit mode, you must define your input "
                           "and output file (only 1 provided)")
                sys.exit(1)

            in_ext = utils.get_extension(args[0], remove_compression=True)
            out_ext = utils.get_extension(args[1], remove_compression=True)

            # Check that the input file exists
            # Fixes https://github.com/bioconvert/bioconvert/issues/204
            if os.path.exists(args[0]) is False:
                _log.error("Input file {} does not exist".format(args[0]))
                sys.exit(1)

            # converter(s) found for the extension pair (in_ext, out_ext)
            try:
                converter = registry.get_ext((in_ext, out_ext))
            except KeyError:
                converter = []

            # if no converter is found
            if not converter:
                _log.error(
                    '\n Bioconvert does not support conversion {} -> {}. \n'
                    'Please specify the converter'
                    '\n Usage : \n\n'
                    '\t bioconvert converter input_file output_file \n '
                    '\n To see all the converter : '
                    '\n \t bioconvert --help '.format(in_ext, out_ext))
                sys.exit(1)
            # if the ext_pair matches a single converter
            elif len(converter) == 1:
                args.insert(0, converter[0].__name__.lower())
            # if the ext_pair matches multiple converters
            else:
                _log.error("Ambiguous extension.\n"
                           "You must specify the right conversion Please "
                           "choose a conversion from: \n\n"
                           "{}".format("\n".join(
                               [c.__name__.lower() for c in converter])))
                sys.exit(1)

    # Set the default level
    bioconvert.logger.level = "ERROR"

    # Changing the log level before argparse is run. Flags are applied in
    # this order, so a later flag overrides an earlier one.
    for flag in ("-l", "--level", "-v", "--verbosity"):
        if flag not in args:
            continue
        position = args.index(flag) + 1
        if position >= len(args):
            continue
        try:
            bioconvert.logger.level = args[position]
        except Exception:
            # Deliberately best-effort: how the logger rejects an unknown
            # level is not visible here, and argparse validates the value
            # against its choices afterwards.
            pass

    allow_indirect_conversion = (
        "--allow-indirect-conversion" in args or "-a" in args)

    # RawDescriptionHelpFormatter prints description and epilog verbatim,
    # so their layout is carried by explicit line breaks.
    arg_parser = argparse.ArgumentParser(
        prog="bioconvert",
        description="""Convertor infer the formats from the first command.
We do not scan the input file. Therefore users must ensure that their input
format files are properly formatted.""",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Bioconvert contains tens of converters whose list is available as follows:

    bioconvert --help

Each conversion has its own sub-command and dedicated help. For instance:

    bioconvert fastq2fasta --help

Because the subcommand contains the format, extensions are not important
for the conversion itself. This would convert the test.txt file (fastq
format) into a fasta file:

    bioconvert fastq2fasta test.txt test.fasta

Users must ensure that their input format files are properly formatted.

If there is a conversion from A to B and another for B to C, you can also
perform indirect conversion using -a argument (experimental). This command
shows all possible indirect conversions:

    bioconvert --help -a

Please visit http://bioconvert.readthedocs.org for more information about the
project or formats available.

Bioconvert is an open source collaborative project. Please feel free to
join us at https://github/biokit/bioconvert
""")

    subparsers = arg_parser.add_subparsers(
        help='sub-command help',
        dest='converter',
    )

    # Width of the input-format column in the sub-command help.
    max_converter_width = 2 + max(
        (len(in_fmt) for in_fmt, _, _, _ in registry.iter_converters()),
        default=0)

    # show all possible conversion
    for in_fmt, out_fmt, converter, path in \
            sorted(registry.iter_converters(allow_indirect_conversion)):

        sub_parser_name = "{}2{}".format(in_fmt.lower(), out_fmt.lower())

        if converter:
            # direct conversion
            link_char = '-'
            if len(converter.available_methods) < 1:
                if converter._library_to_install is None:
                    help_details = " (no available methods please see the doc" \
                                   " for install the necessary libraries) "
                else:
                    help_details = " (no available methods please install {} \n" \
                                   "see the doc for more details) ".format(
                                       converter._library_to_install)
            else:
                help_details = " (%i methods)" % len(
                    converter.available_methods)
        else:
            # indirect conversion: path holds the chain of formats
            link_char = '~'
            if len(path) == 3:
                help_details = " (w/ 1 intermediate)"
            else:
                help_details = " (w/ %i intermediates)" % (len(path) - 2)

        help_text = '{}to{}> {}{}'.format(
            (in_fmt + ' ').ljust(max_converter_width, link_char),
            link_char,
            out_fmt,
            help_details,
        )

        sub_parser = subparsers.add_parser(
            sub_parser_name,
            help=help_text,
            formatter_class=argparse.ArgumentDefaultsHelpFormatter,
            epilog="""Bioconvert is an open source collaborative project.
Please feel free to join us at https://github/biokit/bioconvert
""",
        )

        if converter:
            converter.add_argument_to_parser(sub_parser=sub_parser)
        elif path:
            for a in ConvBase.get_common_arguments():
                a.add_to_sub_parser(sub_parser)

    arg_parser.add_argument(
        "-v",
        "--verbosity",
        default=bioconvert.logger.level,
        help="Set the outpout verbosity.",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
    )
    arg_parser.add_argument(
        "--dependency-report",
        action="store_true",
        default=False,
        help="Output all bioconvert dependencies in json and exit")
    arg_parser.add_argument("-a",
                            "--allow-indirect-conversion",
                            action="store_true",
                            help="Show all possible indirect conversions "
                            "(labelled as intermediate) (EXPERIMENTAL)")
    arg_parser.add_argument("--version",
                            action="store_true",
                            default=False,
                            help="Show version")
    arg_parser.add_argument(
        "--conversion-graph",
        nargs="?",
        default=None,
        choices=[
            "cytoscape",
            "cytoscape-all",
        ],
    )

    try:
        args = arg_parser.parse_args(args)
    except SystemExit as e:
        # parsing asked to stop, maybe a normal exit
        if e.code == 0:
            raise

        # Parsing failed, trying to guess the converter the user meant.
        # args is still the raw list here because parse_args did not return.
        from bioconvert.core.levenshtein import wf_levenshtein as lev

        # Options whose following token is a value, not a sub-command.
        # '--verbose' is kept for backward compatibility; the declared
        # option is '--verbosity'.
        value_flags = ('-v', '--verbosity', '--verbose', '--conversion-graph')

        sub_command = None
        for position, token in enumerate(args):
            if token.startswith('-'):
                continue
            if position > 0 and args[position - 1] in value_flags:
                continue
            sub_command = token
            break

        if sub_command is None:
            # No sub_command found, so letting the initial exception be risen
            raise

        conversions = []
        for in_fmt, out_fmt, converter, path in registry.iter_converters(
                allow_indirect_conversion):
            conversion_name = "{}2{}".format(in_fmt.lower(), out_fmt.lower())
            conversions.append((lev(conversion_name,
                                    sub_command), conversion_name))
        matches = sorted(conversions)[:5]

        if not matches or matches[0][0] == 0:
            # nothing to suggest, or sub_command was ok and the problem
            # comes from elsewhere
            raise

        arg_parser.exit(
            e.code, '\n\nYour converter {}() was not found. \n'
            'Here is a list of possible matches: {} ... '
            '\nYou may also add the -a argument to enfore a '
            'transitive conversion. The whole list is available using\n\n'
            ' bioconvert --help -a \n'.format(
                sub_command, ', '.join([v for _, v in matches])))

    if args.version:
        print("{}".format(bioconvert.version))
        sys.exit(0)

    if args.dependency_report:
        print(
            json.dumps(
                get_known_dependencies_with_availability(as_dict=True),
                sort_keys=True,
                indent=4,
            ))
        sys.exit(0)

    if args.conversion_graph:
        if args.conversion_graph.startswith("cytoscape"):
            all_converter = args.conversion_graph == "cytoscape-all"
            print(
                json.dumps(
                    graph.create_graph_for_cytoscape(
                        all_converter=all_converter),
                    indent=4,
                ))
        sys.exit(0)

    if args.converter is None:
        msg = 'No converter specified. You can list converter by doing bioconvert --help'
        arg_parser.error(msg)

    if not (getattr(args, "show_methods", False) or args.input_file):
        arg_parser.error('Either specify an input_file (<INPUT_FILE>) or '
                         'ask for available methods (--show-method)')

    if not args.allow_indirect_conversion and \
            ConvMeta.split_converter_to_format(args.converter) not in registry:
        arg_parser.error('The conversion {} is not available directly, '
                         'you have to accept that we chain converter to do'
                         ' so (--allow-indirect-conversion or -a)'.format(
                             args.converter))

    # DEBUG verbosity implies that exceptions are re-raised
    args.raise_exception = args.raise_exception or args.verbosity == "DEBUG"

    # Set the logging level
    bioconvert.logger.level = args.verbosity

    # In batch mode the input argument is a glob pattern; otherwise it is a
    # single file.
    if args.batch:
        filenames = glob.glob(args.input_file)
    else:
        filenames = [args.input_file]

    for filename in filenames:
        args.input_file = filename
        try:
            analysis(args)
        except Exception as e:
            if args.raise_exception:
                raise
            else:
                bioconvert.logger.error(e)
                sys.exit(1)