def main(args=None):
    """Command-line entry point of ``bioconvert``.

    The function works in three stages:

    1. *Implicit mode*: when the first argument is not a known converter
       name and contains a dot, the converter is inferred from the
       extensions of the first two arguments and inserted at the front of
       ``args``.
    2. The argument parser is built with one sub-command per conversion
       yielded by the registry and ``args`` is parsed. When parsing fails,
       the closest converter names (Levenshtein distance) are suggested.
    3. ``--version``, ``--dependency-report`` and ``--conversion-graph``
       print their report and exit; otherwise :func:`analysis` is called
       once per input file.

    :param list args: command-line arguments without the program name.
        Defaults to ``sys.argv[1:]``. The list is modified in place when a
        converter name is inferred.
    """
    registry = Registry()

    if args is None:
        args = sys.argv[1:]

    # Test the effective argument list rather than sys.argv so that
    # programmatic calls (main([...])) behave like the command line.
    if args:
        # check that the first argument is not a converter in the registry
        if args[0].lower() not in list(registry.get_converters_names()) \
                and "." in args[0]:

            # Check that the input file exists
            # Fixes https://github.com/bioconvert/bioconvert/issues/204
            if os.path.exists(args[0]) is False:
                _log.error("Input file {} does not exist".format(args[0]))
                sys.exit(1)

            # The output extension is read from the second argument; fail
            # cleanly instead of raising IndexError when it is missing.
            if len(args) < 2:
                _log.error("An output file is required to infer the "
                           "converter from the file extensions")
                sys.exit(1)

            in_ext = utils.get_extension(args[0], remove_compression=True)
            out_ext = utils.get_extension(args[1], remove_compression=True)

            # assign to converter the converter(s) found for the
            # ext_pair = (in_ext, out_ext)
            try:
                converter = registry.get_ext((in_ext, out_ext))
            except KeyError:
                converter = []

            # if no converter is found
            if not converter:
                _log.error(
                    '\n Bioconvert does not support conversion {} -> {}. \n'
                    'Please specify the converter'
                    '\n Usage : \n\n'
                    '\t bioconvert converter input_file output_file \n '
                    '\n To see all the converter : '
                    '\n \t bioconvert --help '.format(in_ext, out_ext))
                sys.exit(1)
            # if the ext_pair matches a single converter
            elif len(converter) == 1:
                args.insert(0, converter[0].__name__.lower())
            # if the ext_pair matches multiple converters
            else:
                _log.error("Ambiguous extension.\n"
                           "You must specify the right conversion  Please "
                           "choose a conversion from: \n\n"
                           "{}".format("\n".join(
                               [c.__name__.lower() for c in converter])))
                sys.exit(1)

    # Set the default level
    bioconvert.logger.level = "ERROR"

    # Changing the log level before argparse is run. Flags are scanned in
    # this order so that the last one found wins.
    for flag in ("-l", "--level", "-v", "--verbosity"):
        try:
            requested_level = args[args.index(flag) + 1]
        except (ValueError, IndexError):
            # flag absent, or present without a value
            continue
        try:
            bioconvert.logger.level = requested_level
        except Exception:
            # Best effort only: an invalid level is reported by argparse.
            pass

    allow_indirect_conversion = (
        "--allow-indirect-conversion" in args or "-a" in args)

    arg_parser = argparse.ArgumentParser(
        prog="bioconvert",
        description="Convertor infer the formats from the first command. "
                    "We do not scan\nthe input file. Therefore users must "
                    "ensure that their input format\nfiles are properly "
                    "formatted.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Bioconvert contains tens of converters whose list is available as follows:

    bioconvert --help

Each conversion has its own sub-command and dedicated help. For instance:

    bioconvert fastq2fasta --help

Because the subcommand contains the format, extensions are not important
for the conversion itself. This would convert the test.txt file (fastq
format) into a fasta file:

    bioconvert fastq2fasta test.txt test.fasta

Users must ensure that their input format files are properly formatted.

If there is a conversion from A to B and another for B to C, you can also
perform indirect conversion using -a argument (experimental). This command
shows all possible indirect conversions:

    bioconvert --help -a

Please visit http://bioconvert.readthedocs.org for more information about the
project or formats available.

Bioconvert is an open source collaborative project. Please feel free to
join us at https://github/biokit/bioconvert
""")

    subparsers = arg_parser.add_subparsers(
        help='sub-command help',
        dest='converter',
    )
    max_converter_width = 2 + max(
        [len(in_fmt) for in_fmt, _, _, _ in registry.iter_converters()])

    # show all possible conversion
    for in_fmt, out_fmt, converter, path in \
            sorted(registry.iter_converters(allow_indirect_conversion)):
        sub_parser_name = "{}2{}".format(in_fmt.lower(), out_fmt.lower())

        if converter:
            # direct conversion
            link_char = '-'
            n_methods = len(converter.available_methods)
            if n_methods < 1 and converter._library_to_install is None:
                help_details = " (no available methods please see the doc" \
                               " for install the necessary libraries) "
            elif n_methods < 1:
                help_details = " (no available methods please install {} \n" \
                               "see the doc for more details) ".format(
                                   converter._library_to_install)
            else:
                help_details = " (%i methods)" % n_methods
        else:
            # indirect conversion: path holds every format on the way
            link_char = '~'
            if len(path) == 3:
                help_details = " (w/ 1 intermediate)"
            else:
                help_details = " (w/ %i intermediates)" % (len(path) - 2)

        help_text = '{}to{}> {}{}'.format(
            (in_fmt + ' ').ljust(max_converter_width, link_char),
            link_char,
            out_fmt,
            help_details,
        )
        sub_parser = subparsers.add_parser(
            sub_parser_name,
            help=help_text,
            formatter_class=argparse.ArgumentDefaultsHelpFormatter,
            epilog="""Bioconvert is an open source collaborative project.
Please feel free to join us at https://github/biokit/bioconvert
""",
        )

        if converter:
            converter.add_argument_to_parser(sub_parser=sub_parser)
        elif path:
            for a in ConvBase.get_common_arguments():
                a.add_to_sub_parser(sub_parser)

    arg_parser.add_argument(
        "-v",
        "--verbosity",
        default=bioconvert.logger.level,
        help="Set the outpout verbosity.",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
    )
    arg_parser.add_argument(
        "--dependency-report",
        action="store_true",
        default=False,
        help="Output all bioconvert dependencies in json and exit")
    arg_parser.add_argument("-a",
                            "--allow-indirect-conversion",
                            action="store_true",
                            help="Show all possible indirect conversions "
                                 "(labelled as intermediate) (EXPERIMENTAL)")
    arg_parser.add_argument("--version",
                            action="store_true",
                            default=False,
                            help="Show version")
    arg_parser.add_argument(
        "--conversion-graph",
        nargs="?",
        default=None,
        choices=[
            "cytoscape",
            "cytoscape-all",
        ],
    )

    try:
        args = arg_parser.parse_args(args)
    except SystemExit as e:
        # parsing ask to stop, maybe a normal exit
        if e.code == 0:
            raise
        # Parsing failed, trying to guess converter. ``args`` is still the
        # raw list here because the assignment above did not happen.
        from bioconvert.core.levenshtein import wf_levenshtein as lev

        # Options whose following token is a value, not a sub-command.
        value_options = ('-v', '--verbose', '--verbosity', '-l', '--level',
                         '--conversion-graph')
        sub_command = None
        for position, token in enumerate(args):
            if token.startswith('-'):
                continue
            if position > 0 and args[position - 1] in value_options:
                continue
            sub_command = token
            break

        if sub_command is None:
            # No sub_command found, so letting the initial exception be risen
            raise

        conversions = []
        for in_fmt, out_fmt, converter, path in registry.iter_converters(
                allow_indirect_conversion):
            conversion_name = "{}2{}".format(in_fmt.lower(), out_fmt.lower())
            conversions.append((lev(conversion_name,
                                    sub_command), conversion_name))
        matches = sorted(conversions)[:5]
        if not matches or matches[0][0] == 0:
            # sub_command was ok (or nothing to suggest), problem comes
            # from elsewhere
            raise
        arg_parser.exit(
            e.code, '\n\nYour converter {}() was not found. \n'
            'Here is a list of possible matches: {} ... '
            '\nYou may also add the -a argument to enfore a '
            'transitive conversion. The whole list is available using\n\n'
            '    bioconvert --help -a \n'.format(
                sub_command, ', '.join([v for _, v in matches])))

    if args.version:
        print("{}".format(bioconvert.version))
        sys.exit(0)

    if args.dependency_report:
        print(
            json.dumps(
                get_known_dependencies_with_availability(as_dict=True),
                sort_keys=True,
                indent=4,
            ))
        sys.exit(0)

    if args.conversion_graph:
        if args.conversion_graph.startswith("cytoscape"):
            all_converter = args.conversion_graph == "cytoscape-all"
            print(
                json.dumps(
                    graph.create_graph_for_cytoscape(
                        all_converter=all_converter),
                    indent=4,
                ))
        sys.exit(0)

    if args.converter is None:
        msg = 'No converter specified. You can list converter by doing bioconvert --help'
        arg_parser.error(msg)

    if not (getattr(args, "show_methods", False) or args.input_file):
        arg_parser.error('Either specify an input_file (<INPUT_FILE>) or '
                         'ask for available methods (--show-method)')

    if not args.allow_indirect_conversion and \
            ConvMeta.split_converter_to_format(args.converter) not in registry:
        arg_parser.error('The conversion {} is not available directly, '
                         'you have to accept that we chain converter to do'
                         ' so (--allow-indirect-conversion or -a)'.format(
                             args.converter))

    args.raise_exception = args.raise_exception or args.verbosity == "DEBUG"

    # Set the logging level
    bioconvert.logger.level = args.verbosity

    # Figure out whether we have several input files or not
    # Are we in batch mode ?
    import glob
    if args.batch:
        filenames = glob.glob(args.input_file)
    else:
        filenames = [args.input_file]

    for filename in filenames:
        args.input_file = filename
        try:
            analysis(args)
        except Exception as e:
            if args.raise_exception:
                raise
            bioconvert.logger.error(e)
            sys.exit(1)
def __init__(self, infile, outfile, in_fmt=None, out_fmt=None, force=False):
    """.. rubric:: constructor

    :param str infile: The path of the input file.
    :param str outfile: The path of The output file
    :param str in_fmt: the format for the input file; when None it is
        derived from the input extension
    :param str out_fmt: the format for the output; when None it is
        derived from the output extension
    :param bool force: overwrite output file if it exists already
        otherwise raises an error
    :raises ValueError: if *outfile* exists and *force* is False
    :raises IOError: if *outfile* ends with ``.dsrc`` but not with
        ``.fastq.dsrc``
    :raises Exception: if neither a direct converter nor a chain of
        converters is found for the requested formats
    """
    # The input file is deliberately not checked: there are cases where
    # the input parameter is just a prefix.

    # check existence of output file. If it exists,
    # fails except if force argument is set to True
    if os.path.exists(outfile) is True:
        msg = "output file {} exists already".format(outfile)
        _log.warning("output file exists already")
        if force is False:
            _log.critical("output file exists. If you are using bioconvert, use --force ")
            raise ValueError(msg)
        else:
            _log.warning("output file will be overwritten")

    # Only fastq files can be compressed with dsrc
    if outfile.endswith(".dsrc"):
        # only valid for FastQ files extension
        # dsrc accepts only .fastq file extension
        if outfile.endswith(".fastq.dsrc") is False:
            msg = "When compressing with .dsrc extension, " + \
                "only files ending with .fastq extension are " + \
                "accepted. This is due to the way dsrc executable " + \
                "is implemented."
            _log.critical(msg)
            raise IOError(msg)

    # Case1: fastq.gz to fasta.bz2
    # Here, we want to decompress, convert, compress.
    # so we need the extension without .gz or .bz2
    self.inext = getext(infile, remove_compression=True)
    self.outext = getext(outfile, remove_compression=True)

    # Case 2, fastq.gz to fastq.bz2
    # data is not changed, just the type of compression, so we want
    # to keep the original extensions, here inext and outext will contain
    # .gz and .bz2
    if self.inext == self.outext:
        _log.info("decompression/compression mode")
        self.inext = getext(infile)
        self.outext = getext(outfile)

    self.mapper = Registry()

    # From the input parameters 1 and 2, we get the module name
    try:
        if in_fmt is None:
            in_fmt = get_format_from_extension(self.inext)
        if out_fmt is None:
            out_fmt = get_format_from_extension(self.outext)

        self.in_fmt = in_fmt.upper()
        self.out_fmt = out_fmt.upper()

        _log.info("Input: %s", self.in_fmt)
        _log.info("Output: %s", self.out_fmt)
        class_converter = self.mapper[(self.in_fmt, self.out_fmt)]
        self.name = class_converter.__name__
    except KeyError:
        # This module name was not found
        # Try to find path of converters
        conv_path = self.mapper.conversion_path(self.in_fmt, self.out_fmt)
        _log.debug("path: {}".format(conv_path))
        if conv_path:
            _log.info("Direct conversion not implemented. "
                      "Chaining converters.")
            # implemented in bioconvert/core/base.py
            # using temporary files
            class_converter = make_chain([
                (pair, self.mapper[pair]) for pair in conv_path])
        else:
            msg = "Requested input format ('%s') to output format ('%s') is not available in bioconvert" % (
                self.in_fmt,
                self.out_fmt,
            )
            _log.critical(msg)
            _log.critical("Use --formats to know the available formats and --help for examples")
            raise Exception(msg)

    self.converter = class_converter(infile, outfile)
    _log.info("Using {} class".format(self.converter.name))
def analysis(args):
    """Run the conversion (or the benchmark) described by parsed arguments.

    :param args: parsed command-line arguments. The attributes read here
        are ``converter``, ``input_file``, ``output_file``, ``force``,
        ``raise_exception``, ``benchmark``, ``benchmark_N`` and, when
        present, ``show_methods``. All attributes are also forwarded as
        keyword arguments to the converter call.
    :raises RuntimeError: if the input or output format of the converter
        is empty.
    """
    in_fmt, out_fmt = ConvMeta.split_converter_to_format(args.converter)

    # do we want to know the available methods ? If so, print info and quit
    if getattr(args, "show_methods", False):
        class_converter = Registry()[(in_fmt, out_fmt)]
        print("Methods available: {}".format(
            class_converter.available_methods))
        print("\nPlease see http://bioconvert.readthedocs.io/en/master/"
              "references.html#{} for details ".format(
                  str(class_converter).split("'")[1]))
        # With raise_exception set, return instead of exiting.
        if args.raise_exception:
            return
        sys.exit(0)

    # Input and output filename
    infile = args.input_file

    # Check that the input file exists
    # Fixes https://github.com/bioconvert/bioconvert/issues/204
    if os.path.exists(infile) is False:
        # Some convertors uses prefix instead of filename. We could have
        # ambiguities: if we use a prefix without extension,
        # we could be confused with the convertor name. This is true
        # for the plink families
        if "plink" in args.converter:
            pass
        else:
            _log.error(
                "Input file {} does not exist (analysis)".format(infile))
            sys.exit(1)

    if args.output_file is None and infile:
        # Default output name: input name with the output format as
        # extension.
        outext = ConvMeta.split_converter_to_format(args.converter)
        outfile = infile.rsplit(".", 1)[0] + "." + outext[1].lower()
    else:
        outfile = args.output_file

    # Call a generic wrapper of all available conversion
    conv = Bioconvert(
        infile,
        outfile,
        in_fmt=in_fmt,
        out_fmt=out_fmt,
        force=args.force,
    )

    if not conv.in_fmt:
        raise RuntimeError("convert infer the format from the extension name."
                           " So add extension to the input file name or use"
                           " --input-format option.")

    if not conv.out_fmt:
        raise RuntimeError("convert infer the format from the extension name."
                           " So add extension to the output file name or use"
                           " --output-format option.")

    bioconvert.logger.info("Converting from {} to {}".format(
        conv.in_fmt, conv.out_fmt))

    if args.benchmark:
        conv.boxplot_benchmark(N=args.benchmark_N)
        import pylab
        try:
            outpng = "benchmark_{}.png".format(conv.name)
            pylab.savefig(outpng, dpi=200)
        except Exception:
            # Fall back on the name of the wrapped converter.
            outpng = "benchmark_{}.png".format(conv.converter.name)
            pylab.savefig(outpng, dpi=200)
        bioconvert.logger.info("File {} created".format(outpng))
    else:
        conv(**vars(args))
#  along with this program (COPYING file).
#
#  If not, see <http://www.gnu.org/licenses/>.
#
###########################################################################
"""
Available methods per converter
=====================================

Plot number of implemented methods per converter.

"""
#################################################
#
from bioconvert.core.registry import Registry

r = Registry()
info = r.get_info()

# The available unique converters, as (key, value) pairs of ``info``
converters = list(info.items())

# the number of methods per converter (the values of ``info``)
data = list(info.values())

print("Number of converters: {}".format(len(converters)))
print("Number of methods : {}".format(sum(data)))

#####################################################
from pylab import hist, clf, xlabel, grid

# start from an empty figure
clf()
def test_rgistry():
    """Smoke test: build a Registry, call ``info()`` and print the instance."""
    registry = Registry()
    registry.info()
    print(registry)
def create_graph(filename, layout="dot", use_singularity=False,
                 color_for_disabled_converter='red'):
    """Draw the graph of all conversions returned by the registry.

    pygraphviz is used unless a ``.dot`` file or singularity is requested.
    If it is skipped or fails, a dot description is written to a temporary
    file and rendered with the ``dot`` executable.

    :param filename: should end in .png or .svg or .dot
    :param str layout: layout passed to pygraphviz
    :param bool use_singularity: run ``dot`` through the graphviz4all
        singularity image instead of using pygraphviz
    :param str color_for_disabled_converter: edge color used with
        pygraphviz when the status flag of a conversion is false

    If extension is .dot, only the dot file is created.
    This is useful if you have issues installing graphviz.
    If so, under Linux you could use our singularity container
    see github.com/cokelaer/graphviz4all
    """
    from bioconvert.core.registry import Registry
    rr = Registry()

    # Decide explicitly which backend to use instead of raising a dummy
    # exception, so that only real pygraphviz failures are logged.
    if not (filename.endswith(".dot") or use_singularity is True):
        try:
            from pygraphviz import AGraph
            dg = AGraph(directed=True)

            for a, b, s in rr.get_all_conversions():
                dg.add_edge(a, b,
                            color='black' if s else color_for_disabled_converter)

            dg.layout(layout)
            dg.draw(filename)
            return
        except Exception as e:
            # Import or rendering problem: report it, then fall back on
            # the dot executable below.
            _log.error(e)

    parts = ["""
strict digraph{
    node [label="\\N"];

"""]
    # one node statement per distinct input format
    nodes = {items[0] for items in rr.get_all_conversions()}
    for node in nodes:
        parts.append("\"{}\";\n".format(node))
    for a, b, s in rr.get_all_conversions():
        parts.append("\"{}\" -> \"{}\";\n".format(a, b))
    parts.append("}\n")
    dot = "".join(parts)

    from easydev import TempFile
    from bioconvert import shell

    dotfile = TempFile(suffix=".dot")
    with open(dotfile.name, "w") as fout:
        fout.write(dot)

    dotpath = ""
    if use_singularity:
        from bioconvert.core.downloader import download_singularity_image
        singfile = download_singularity_image(
            "graphviz.simg",
            "shub://cokelaer/graphviz4all:v1",
            "4288088d91c848e5e3a327282a1ab3d1")
        dotpath = "singularity run {} ".format(singfile)

    # On ReadTheDocs the plain ``dot`` executable is always used.
    on_rtd = environ.get('READTHEDOCS', None) == 'True'
    if on_rtd:
        dotpath = ""

    ext = filename.rsplit(".", 1)[1]
    cmd = "{}dot -T{} {} -o {}".format(dotpath, ext, dotfile.name, filename)
    try:
        shell(cmd)
    except Exception:
        # bioconvert's shell wrapper failed: try a plain system call
        import os
        os.system(cmd)
def main(args=None):
    """Command-line entry point of ``bioconvert``.

    The function works in three stages:

    1. *Implicit mode* (as reported by ``ParserHelper``): the converter is
       inferred from the extensions of the filenames and inserted at the
       front of ``args``. Every split of the filenames into inputs and
       outputs is tried; a change of compression only is also supported.
    2. The argument parser is built with one sub-command per conversion
       yielded by the registry and ``args`` is parsed. When parsing fails,
       the closest converter names (Levenshtein distance) are suggested.
    3. ``--version``, ``--dependency-report`` and ``--conversion-graph``
       print their report and exit; otherwise :func:`analysis` is called
       once per input file.

    :param list args: command-line arguments without the program name.
        Defaults to ``sys.argv[1:]``. The list is modified in place when a
        converter name is inferred.
    """
    # used later on
    registry = Registry()

    if args is None:
        args = sys.argv[1:]

    # convenient variable to check implicit/explicit mode and
    # get information about the arguments.
    ph = ParserHelper(args)

    # Test the effective argument list rather than sys.argv so that
    # programmatic calls (main([...])) behave like the command line.
    if args and ph.mode == "implicit":
        # Check that the input file exists
        # Fixes https://github.com/bioconvert/bioconvert/issues/204
        if os.path.exists(args[0]) is False:
            _log.error("First input file {} does not exist".format(
                args[0]))
            sys.exit(1)

        # list of filenames from which we get the extensions
        filenames = ph.get_filelist()
        exts = [
            utils.get_extension(x, remove_compression=True)
            for x in filenames
        ]

        # We need to get the corresponding converter if any.
        # We assume that the input formats are ordered alphabetically
        # (bioconvert API).
        # For instance fasta,qual to fastq can be
        # found but qual,fasta to fastq cannot. Indeed, in more complex
        # cases such as a,b -> c,d we cannot know whether there are 1 or 3
        # inputs. This would require extra code here below
        converter = []
        # Defined up-front so that the error message below also works
        # when fewer than two filenames are available.
        in_ext, out_ext = tuple(exts), ()
        # if input is a,b,c,d we want to try a->(b,c,d) and
        # (a,b)->(c,d) and (a,b,c)->d so L-1 cases
        for i in range(1, len(exts)):
            in_ext = tuple(exts[0:i])
            out_ext = tuple(exts[i:])
            try:
                converter.extend(registry.get_ext((in_ext, out_ext)))
            except KeyError:
                pass

        # For 1-to-1, if the extensions are identical but different
        # compression, this means we just want to decompress and
        # re-compress in another format.
        if not converter and len(exts) > 1 and exts[0] == exts[1]:
            exts_with_comp = [
                utils.get_extension(x, remove_compression=False)
                for x in filenames
            ]
            in_ext, out_ext = exts_with_comp[0], exts_with_comp[1]
            comps = ['gz', 'dsrc', 'bz2']
            if in_ext in comps and out_ext in comps:
                try:
                    converter.extend(
                        registry.get_ext(((in_ext, ), (out_ext, ))))
                except KeyError:
                    # unsupported compression pair: reported below
                    pass

        # if no converter is found, print information
        if not converter:
            msg = '\nBioconvert does not support conversion {} -> {}. \n\n'
            msg = msg.format(in_ext, out_ext)

            # maybe it is an indirect conversion ? let us look at the
            # digraph
            try:
                _path = registry._path_dict_ext[in_ext][out_ext]
                # Here, we have a transitive list of tuples to go from A to C
                # example from fq to clustal returns:
                # [('fq',), ('fa',), ('clustal',)]
                # If we naively build the converter from those names
                # (fq2clustal), this is a non official converter name. The
                # official one is fastq2clustal, so we need some hack here:
                in_name, int_name, out_name = _path
                a = registry._ext_registry[
                    in_name, int_name][0].__name__.split("2")[0]
                b = registry._ext_registry[
                    int_name, out_name][0].__name__.split("2")[1]
                convname = "2".join([a, b]).lower()

                msg += "\n".join(
                    textwrap.wrap(
                        "Note, however, that an indirect conversion through"
                        " an intermediate format is possible for your input and "
                        " output format. To do so, you need to use the -a option "
                        " and be explicit about the type of conversion. To get "
                        " the list of possible direct and indirect conversion, "
                        " please use:\n\n"))
                msg += "\n\n    bioconvert --help -a\n\n"
                msg += "For help and with your input/output most probably "
                msg += "the command should be: \n\n    bioconvert {} {} -a\n\n ".format(
                    convname, " ".join(ph.get_filelist()))
            except (KeyError, ValueError):
                # no converter found in the path, or a path that is not
                # made of exactly three formats: no hint is added
                pass

            error(msg)

        # if the ext_pair matches a single converter
        elif len(converter) == 1:
            args.insert(0, converter[0].__name__.lower())
        # if the ext_pair matches multiple converters
        else:
            _log.error("Ambiguous extension.\n"
                       "You must specify the right conversion  Please "
                       "choose a conversion from: \n\n"
                       "{}".format("\n".join(
                           [c.__name__.lower() for c in converter])))
            sys.exit(1)

    # Set the default level
    bioconvert.logger.level = "ERROR"

    # Changing the log level before argparse is run. Flags are scanned in
    # this order so that the last one found wins.
    for flag in ("-l", "--level", "-v", "--verbosity"):
        try:
            requested_level = args[args.index(flag) + 1]
        except (ValueError, IndexError):
            # flag absent, or present without a value
            continue
        try:
            bioconvert.logger.level = requested_level
        except Exception:
            # Best effort only: an invalid level is reported by argparse.
            pass

    # if there is the ability to convert from A to B to C, we must set
    # the option -a (--allow_indirect_conversion)
    allow_indirect_conversion = (
        "--allow-indirect-conversion" in args or "-a" in args)

    # Now, the instanciation of the main bioconvert user interface
    arg_parser = argparse.ArgumentParser(
        prog="bioconvert",
        # the description is intentionally empty; see the epilog
        description="",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Bioconvert contains tens of converters whose list is available as follows:

    bioconvert --help

Each conversion has its own sub-command and dedicated help. For instance:

    bioconvert fastq2fasta --help

Because the subcommand contains the format, extensions are not important
for the conversion itself. This would convert the test.txt file (fastq
format) into a fasta file:

    bioconvert fastq2fasta test.txt test.fasta

If you use known extensions, the converter may be omitted::

    bioconvert test.fastq test.fasta

Users must ensure that their input format files are properly formatted.

If there is a conversion from A to B and another for B to C, you can also
perform indirect conversion using -a argument (experimental). This command
shows all possible indirect conversions:

    bioconvert --help -a

Please visit http://bioconvert.readthedocs.org for more information about the
project or formats available.

Would you wish to help, please join our open source collaborative project at
https://github/bioconvert/bioconvert
""")

    subparsers = arg_parser.add_subparsers(
        help='sub-command help',
        dest='converter',
    )

    # Width of the longest input name as displayed in the help (formats
    # are joined with "_"), not the number of formats in the tuple.
    max_converter_width = 2 + max(
        [len("_".join(ConvBase.lower_tuple(in_fmt)))
         for in_fmt, _, _, _ in registry.iter_converters()])

    def sorting_tuple_string(item):
        # sort on the first input format
        if type(item) is tuple:
            return item[0][0]
        if type(item) is str:
            return item[0]

    # show all possible conversion including indirect conversion
    for in_fmt, out_fmt, converter, path in \
            sorted(registry.iter_converters(allow_indirect_conversion),
                   key=sorting_tuple_string):

        in_name = "_".join(ConvBase.lower_tuple(in_fmt))
        out_name = "_".join(ConvBase.lower_tuple(out_fmt))
        sub_parser_name = "{}2{}".format(in_name, out_name)

        if converter:
            # direct conversion
            link_char = '-'
            if len(converter.available_methods) < 1:
                help_details = " (no available methods please see the doc" \
                               " for install the necessary libraries) "
            else:
                help_details = " (%i methods)" % len(
                    converter.available_methods)
        else:
            # indirect conversion: path holds every format on the way
            link_char = '~'
            if len(path) == 3:
                help_details = " (w/ 1 intermediate)"
            else:
                help_details = " (w/ %i intermediates)" % (len(path) - 2)

        help_text = '{}to{}> {}{}'.format(
            (in_name + ' ').ljust(max_converter_width, link_char),
            link_char,
            out_name,
            help_details,
        )
        sub_parser = subparsers.add_parser(
            sub_parser_name,
            help=help_text,
            formatter_class=argparse.ArgumentDefaultsHelpFormatter,
            epilog="""Bioconvert is an open source collaborative project.
Please feel free to join us at https://github/biokit/bioconvert
""",
        )

        if converter:
            converter.add_argument_to_parser(sub_parser=sub_parser)
        elif path:
            for a in ConvBase.get_IO_arguments():
                a.add_to_sub_parser(sub_parser)
            for a in ConvBase.get_common_arguments():
                a.add_to_sub_parser(sub_parser)

    # arguments when no explicit conversion provided.
    arg_parser.add_argument(
        "-v",
        "--verbosity",
        default=bioconvert.logger.level,
        help="Set the outpout verbosity.",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
    )
    arg_parser.add_argument(
        "-l",
        "--level",
        default=bioconvert.logger.level,
        help="Set the outpout verbosity. Same as --verbosity",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
    )
    arg_parser.add_argument(
        "--dependency-report",
        action="store_true",
        default=False,
        help="Output all bioconvert dependencies in json and exit")
    arg_parser.add_argument("-a",
                            "--allow-indirect-conversion",
                            action="store_true",
                            help="Show all possible indirect conversions "
                                 "(labelled as intermediate)")
    arg_parser.add_argument("--version",
                            action="store_true",
                            default=False,
                            help="Show version")
    arg_parser.add_argument(
        "--conversion-graph",
        nargs="?",
        default=None,
        choices=[
            "cytoscape",
            "cytoscape-all",
        ],
    )

    try:
        args = arg_parser.parse_args(args)
    except SystemExit as e:
        # parsing ask to stop, maybe a normal exit
        if e.code == 0:
            raise
        # Parsing failed, trying to guess converter. ``args`` is still the
        # raw list here because the assignment above did not happen.
        from bioconvert.core.levenshtein import wf_levenshtein as lev

        # Options whose following token is a value, not a sub-command.
        value_options = ('-v', '--verbose', '--verbosity', '-l', '--level',
                         '--conversion-graph')
        sub_command = None
        for position, token in enumerate(args):
            if token.startswith('-'):
                continue
            if position > 0 and args[position - 1] in value_options:
                continue
            sub_command = token
            break

        if sub_command is None:
            # No sub_command found, so letting the initial exception be risen
            raise

        conversions = []
        for in_fmt, out_fmt, converter, path in registry.iter_converters(
                allow_indirect_conversion):
            conversion_name = "{}2{}".format(
                "_".join(ConvBase.lower_tuple(in_fmt)),
                "_".join(ConvBase.lower_tuple(out_fmt)))
            conversions.append((lev(conversion_name,
                                    sub_command), conversion_name))
        matches = sorted(conversions)[:5]
        if not matches or matches[0][0] == 0:
            # sub_command was ok (or nothing to suggest), problem comes
            # from elsewhere
            raise
        arg_parser.exit(
            e.code, '\n\nYour converter {}() was not found. \n'
            'Here is a list of possible matches: {} ... '
            '\nYou may also add the -a argument to enfore a '
            'transitive conversion. The whole list is available using\n\n'
            '    bioconvert --help -a \n'.format(
                sub_command, ', '.join([v for _, v in matches])))

    if args.version:
        print("{}".format(bioconvert.version))
        sys.exit(0)

    if args.dependency_report:
        print(
            json.dumps(
                get_known_dependencies_with_availability(as_dict=True),
                sort_keys=True,
                indent=4,
            ))
        sys.exit(0)

    if args.conversion_graph:
        if args.conversion_graph.startswith("cytoscape"):
            all_converter = args.conversion_graph == "cytoscape-all"
            print(
                json.dumps(
                    graph.create_graph_for_cytoscape(
                        all_converter=all_converter),
                    indent=4,
                ))
        sys.exit(0)

    if args.converter is None:
        msg = "No converter specified. "
        msg += "You can list all converters by using:\n\n\tbioconvert --help"
        arg_parser.error(msg)

    if not (getattr(args, "show_methods", False) or args.input_file):
        arg_parser.error('Either specify an input_file (<INPUT_FILE>) or '
                         'ask for available methods (--show-method)')

    if not args.allow_indirect_conversion and \
            ConvMeta.split_converter_to_format(args.converter) not in registry:
        arg_parser.error('The conversion {} is not available directly, '
                         'you have to accept that we chain converter to do'
                         ' so (--allow-indirect-conversion or -a)'.format(
                             args.converter))

    args.raise_exception = args.raise_exception or args.verbosity == "DEBUG"

    # Set the logging level
    bioconvert.logger.level = args.verbosity

    # Figure out whether we have several input files or not
    # Are we in batch mode ?
    if args.batch:
        filenames = glob.glob(args.input_file)
    else:
        filenames = [args.input_file]

    N = len(filenames)
    for i, filename in enumerate(filenames):
        if N > 1:
            _log.info("Converting {} ({}/{})".format(filename, i + 1, N))
        args.input_file = filename
        try:
            analysis(args)
        except Exception as e:
            if args.raise_exception:
                raise
            bioconvert.logger.error(e)
            sys.exit(1)
def analysis(args):
    """Run the conversion (or the benchmark) described by parsed arguments.

    :param args: parsed command-line arguments. The attributes read here
        are ``converter``, ``input_file``, ``output_file``, ``force``,
        ``raise_exception``, ``benchmark``, ``benchmark_N``,
        ``benchmark_methods`` and, when present, ``show_methods``,
        ``threads`` and ``extra_arguments``. All attributes are also
        forwarded as keyword arguments to the converter call.
    """
    in_fmt, out_fmt = ConvMeta.split_converter_to_format(args.converter)

    # do we want to know the available methods ? If so, print info and quit
    if getattr(args, "show_methods", False):
        class_converter = Registry()[(in_fmt, out_fmt)]
        print("Methods available: {}".format(
            class_converter.available_methods))
        print("\nPlease see http://bioconvert.readthedocs.io/en/master/"
              "references.html#{} for details ".format(
                  str(class_converter).split("'")[1]))
        # With raise_exception set, return instead of exiting.
        if args.raise_exception:
            return
        sys.exit(0)

    # Input and output filename
    infile = args.input_file

    # Check that the input file exists
    # Fixes https://github.com/bioconvert/bioconvert/issues/204
    # NOTE(review): only tuples of input files are checked here; confirm
    # that single files are validated by the Bioconvert constructor.
    if type(infile) is tuple:
        for file in infile:
            if os.path.exists(file) is False:
                # Some convertors uses prefix instead of filename. We could
                # have ambiguities: if we use a prefix without extension,
                # we could be confused with the convertor name. This is true
                # for the plink families
                if "plink" in args.converter:
                    pass
                else:
                    _log.error(
                        "Input file {} does not exist (analysis)".format(file))
                    sys.exit(1)

    if args.output_file is None and infile:
        outext = ConvMeta.split_converter_to_format(args.converter)
        # Strip the extension with os.path.splitext so that dots located
        # in directory names (e.g. "./") are left untouched.
        root = os.path.splitext(infile)[0]
        if infile.split(".")[-1] in ["gz", "dsrc", "bz2"]:
            # compressed input: the format extension must be removed too
            root = os.path.splitext(root)[0]
        outfile = root + "." + outext[1][0].lower()
        _log.debug("Output file set to {}".format(outfile))
    else:
        outfile = args.output_file

    # check whether a valid --thread option was provided
    threads = getattr(args, "threads", None)

    # default will be ""
    extra_arguments = getattr(args, "extra_arguments", "")

    # Call a generic wrapper of all available conversion
    conv = Bioconvert(
        infile,
        outfile,
        force=args.force,
        threads=threads,
        extra=extra_arguments)

    if args.benchmark:
        conv.boxplot_benchmark(N=args.benchmark_N,
                               to_include=args.benchmark_methods)
        import pylab
        try:
            outpng = "benchmark_{}.png".format(conv.name)
            pylab.savefig(outpng, dpi=200)
        except Exception:
            # Fall back on the name of the wrapped converter.
            outpng = "benchmark_{}.png".format(conv.converter.name)
            pylab.savefig(outpng, dpi=200)
        bioconvert.logger.info("File {} created".format(outpng))
    else:
        conv(**vars(args))
def __init__(self, infile, outfile, force=False):
    """.. rubric:: constructor

    :param str infile: The path of the input file.
    :param str outfile: The path of The output file
    :param bool force: overwrite output file if it exists already
        otherwise raises an error
    :raises ValueError: if *infile* does not exist, or if *outfile*
        exists and *force* is False
    :raises IOError: if *outfile* ends with ``.dsrc`` but not with
        ``.fastq.dsrc``

    The process exits with status 1 when no converter is registered for
    the pair of extensions.
    """
    if os.path.exists(infile) is False:
        msg = "Incorrect input file: %s" % infile
        _log.error(msg)
        raise ValueError(msg)

    # check existence of output file. If it exists,
    # fails except if force argument is set to True
    if os.path.exists(outfile) is True:
        msg = "output file {} exists already".format(outfile)
        _log.warning("output file exists already")
        if force is False:
            _log.critical(
                "output file exists. If you are using bioconvert, use --force "
            )
            raise ValueError(msg)
        else:
            _log.warning("output file will be overwritten")

    # Only fastq files can be compressed with dsrc
    if outfile.endswith(".dsrc"):
        # only valid for FastQ files extension
        # dsrc accepts only .fastq file extension
        if outfile.endswith(".fastq.dsrc") is False:
            msg = "When compressing with .dsrc extension, " + \
                "only files ending with .fastq extension are " + \
                "accepted. This is due to the way dsrc executable " + \
                "is implemented."
            _log.critical(msg)
            raise IOError(msg)

    # case1: fastq.gz to fasta.bz2
    # Here, we want to decompress, convert, compress.
    # so we need the extension without .gz or .bz2
    self.inext = getext(infile, remove_compression=True)
    self.outext = getext(outfile, remove_compression=True)

    # Case 2, fastq.gz to fastq.bz2
    # data is not changed, just the type of compression, so we want
    # to keep the original extensions, here inext and outext will contain
    # .gz and .bz2
    if self.inext == self.outext:
        _log.info("decompression/compression mode")
        self.inext = getext(infile)
        self.outext = getext(outfile)

    self.mapper = Registry()

    # From the input parameters 1 and 2, we get the module name
    try:
        _log.info("Input: {}".format(self.inext))
        _log.info("Output: {}".format(self.outext))
        class_converter = self.mapper[(self.inext, self.outext)]
        self.name = class_converter.__name__
    except KeyError:
        # This module name was not found
        msg = "Requested input format ({}) to output format ({}) is not available in bioconvert"
        _log.critical(msg.format(self.inext, self.outext))
        _log.critical(
            "Use --formats to know the available formats and --help for examples"
        )
        sys.exit(1)

    self.converter = class_converter(infile, outfile)
    _log.info("Using {} class".format(self.converter.name))
def create_graph(filename, layout="dot", use_singularity=False,
                 color_for_disabled_converter='red'):
    """Draw the graph of all conversions returned by the registry.

    pygraphviz is used unless a ``.dot`` file or singularity is requested.
    One-to-one conversions are drawn as an edge between two rectangular
    format nodes; other conversions go through a small intermediate node.
    Nodes are filled according to their degree (white, yellow, orange,
    red) and the graph is also written to ``conversion.dot``.
    If pygraphviz is skipped or fails, a dot description is written to a
    temporary file and rendered with the ``dot`` executable.

    :param filename: should end in .png or .svg or .dot
    :param str layout: layout passed to pygraphviz
    :param bool use_singularity: run ``dot`` through the graphviz4all
        singularity image instead of using pygraphviz
    :param str color_for_disabled_converter: edge color used with
        pygraphviz when the status flag of a conversion is false

    If extension is .dot, only the dot file is created.
    This is useful if you have issues installing graphviz.
    If so, under Linux you could use our singularity container
    see github.com/cokelaer/graphviz4all
    """
    from bioconvert.core.registry import Registry
    rr = Registry()

    # Decide explicitly which backend to use instead of raising a dummy
    # exception, so that only real pygraphviz failures are logged.
    if not (filename.endswith(".dot") or use_singularity is True):
        try:
            from pygraphviz import AGraph
            dg = AGraph(directed=True)

            url = "https://bioconvert.readthedocs.io/en/master/formats.html#{}"

            for a, b, s in rr.get_all_conversions():
                edge_color = 'black' if s else color_for_disabled_converter
                if len(a) == 1 and len(b) == 1:
                    dg.add_node(a[0],
                                shape="rectangle",
                                style="filled",
                                url=url.format(a[0].upper()))
                    dg.add_node(b[0],
                                shape="rectangle",
                                style="filled",
                                url=url.format(b[0].upper()))
                    dg.add_edge(a[0], b[0], color=edge_color)
                else:
                    # several inputs and/or outputs: connect them through
                    # a small unlabelled node
                    and_node = "_".join(a) + "_and_" + "_".join(b)
                    dg.add_node(and_node,
                                label="",
                                fillcolor="black",
                                width=.1,
                                height=.1,
                                style="filled",
                                fixedsize=True,
                                shape="circle")
                    for this in a:
                        dg.add_edge(this, and_node, color=edge_color)
                    for this in b:
                        dg.add_edge(and_node, this, color=edge_color)

            # fill color depends on the number of edges of the node
            for name in dg.nodes():
                degree = dg.degree(name)
                if degree < 5:
                    fillcolor = "white"
                elif degree < 10:
                    fillcolor = "yellow"
                elif degree < 20:
                    fillcolor = "orange"
                else:
                    fillcolor = "red"
                dg.get_node(name).attr["fillcolor"] = fillcolor

            dg.layout(layout)
            dg.draw(filename)
            dg.write("conversion.dot")
            return
        except Exception as e:
            # Import or rendering problem: report it, then fall back on
            # the dot executable below.
            _log.error(e)

    parts = ["""
strict digraph{
    node [label="\\N"];

"""]
    # one node statement per distinct input format
    nodes = {items[0] for items in rr.get_all_conversions()}
    for node in nodes:
        parts.append("\"{}\";\n".format(node))
    for a, b, s in rr.get_all_conversions():
        parts.append("\"{}\" -> \"{}\";\n".format(a, b))
    parts.append("}\n")
    dot = "".join(parts)

    from easydev import TempFile
    from bioconvert import shell

    dotfile = TempFile(suffix=".dot")
    with open(dotfile.name, "w") as fout:
        fout.write(dot)

    dotpath = ""
    if use_singularity:
        from bioconvert.core.downloader import download_singularity_image
        singfile = download_singularity_image(
            "graphviz.simg",
            "shub://cokelaer/graphviz4all:v1",
            "4288088d91c848e5e3a327282a1ab3d1")
        dotpath = "singularity run {} ".format(singfile)

    # On ReadTheDocs the plain ``dot`` executable is always used.
    on_rtd = environ.get('READTHEDOCS', None) == 'True'
    if on_rtd:
        dotpath = ""

    ext = filename.rsplit(".", 1)[1]
    cmd = "{}dot -T{} {} -o {}".format(dotpath, ext, dotfile.name, filename)
    _log.debug("Temporary dot file: {}".format(dotfile.name))
    try:
        shell(cmd)
    except Exception:
        # bioconvert's shell wrapper failed: try a plain system call
        import os
        os.system(cmd)