parser.add_argument("json", help="prepared JSON from augur") parser.add_argument("--metadata", help="tab-delimited file to dump prepared JSON metadata to") args = parser.parse_args() # Setup the logger. logger = logging.getLogger(__name__) # Load the JSON data. with open(args.json, "r") as fh: data = json.load(fh) # Prepare a sequence set. sequences = sequence_set( logger, data["sequences"], data["reference"], data["info"]["date_format"] ) # Write sequences to standard out. output_sequences = sequences.seqs.values() Bio.SeqIO.write(output_sequences, sys.stdout, "fasta") # Prepare metadata if it has been requested. if args.metadata: metadata = [sequences.seqs[seq].attributes for seq in sequences.seqs] metadata_df = pd.DataFrame(metadata) metadata_df = metadata_df.rename(columns={"num_date": "prepared_num_date"}) metadata_df.to_csv(args.metadata, sep="\t", index=False)
def __init__(self, config):
    """Check the config file, make necessary directories, set up the logger."""
    super(process, self).__init__()
    self.config = combine_configs("process", config)

    try:
        assert os.path.basename(os.getcwd()) == self.config["dir"]
    except AssertionError:
        print("Run this script from within the {} directory".format(self.config["dir"]))
        sys.exit(2)

    for p in self.config["output"].values():
        if not os.path.isdir(p):
            os.makedirs(p)

    self.log = logger(self.config["output"]["data"], False)

    # Parse the JSON into different data bits.
    try:
        with open(self.config["in"], 'r') as fh:
            data = json.load(fh)
    except Exception as e:
        self.log.fatal("Error loading JSON. Error: {}".format(e))

    self.info = data["info"]
    if "time_interval" in data["info"]:
        self.info["time_interval"] = [
            datetime.strptime(x, '%Y-%m-%d').date()
            for x in data["info"]["time_interval"]
        ]
    self.info["lineage"] = data["info"]["lineage"]

    if 'leaves' in data:
        self.tree_leaves = data['leaves']

    try:
        self.colors = data["colors"]
    except KeyError:
        self.log.notify("* colours have not been set")
        self.colors = False

    try:
        self.lat_longs = data["lat_longs"]
    except KeyError:
        self.log.notify("* latitude & longitudes have not been set")
        self.lat_longs = False

    # Backwards compatibility - set up file_dumps (needs a rewrite sometime).
    # self.sequence_fname = self.input_data_path + '.fasta'
    self.file_dumps = {}
    self.output_path = os.path.join(self.config["output"]["data"], self.info["prefix"])
    self.file_dumps['seqs'] = self.output_path + '_sequences.pkl.gz'
    self.file_dumps['tree'] = self.output_path + '_tree.newick'
    self.file_dumps['nodes'] = self.output_path + '_nodes.pkl.gz'

    if self.config["clean"]:
        self.log.notify("Removing intermediate files for a clean build")
        for f in glob.glob(self.output_path + "*"):
            os.remove(f)

    if "reference" in data:
        self.seqs = sequence_set(self.log, data["sequences"], data["reference"], self.info["date_format"])
    else:
        self.log.fatal("No reference provided. Cannot continue.")
        # self.seqs = sequence_set(self.log, data["sequences"], False, self.info["date_format"])

    # Backward compatibility.
    self.reference_seq = self.seqs.reference_seq
    self.proteins = self.seqs.proteins

    for trait in self.info["traits_are_dates"]:
        self.seqs.convert_trait_to_numerical_date(trait, self.info["date_format"])

    # Prepare titers if they are available.
    if "titers" in data:
        self.log.debug("Loaded %i titer measurements" % len(data["titers"]))

        # Convert titer dictionary keys from JSON-compatible strings back
        # to tuples.
        self.titers = {
            eval(key): value
            for key, value in data["titers"].items()
        }

    # Useful flag (set from the pathogen run file) to disable restoring.
    self.try_to_restore = True
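# A minimal, self-contained sketch (not part of the original class) of the
# titer-key round trip that __init__ assumes: JSON cannot represent tuple
# keys, so upstream code is expected to serialize them via repr(), and
# eval() restores them here. ast.literal_eval is a safer drop-in when the
# strings are known to be Python literals:
#
#     import ast
#     import json
#
#     titers = {("A/Perth/16/2009", "ferret_1"): 160}
#     dumped = json.dumps({repr(k): v for k, v in titers.items()})
#     restored = {ast.literal_eval(k): v
#                 for k, v in json.loads(dumped).items()}
#     assert restored == titers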