Exemplo n.º 1
0
    parser.add_argument("json", help="prepared JSON from augur")
    parser.add_argument("--metadata", help="tab-delimited file to dump prepared JSON metadata to")

    args = parser.parse_args()

    # Setup the logger.
    logger = logging.getLogger(__name__)

    # Load the JSON data.
    with open(args.json, "r") as fh:
        data = json.load(fh)

    # Prepare a sequence set from the parsed JSON; requires the "sequences",
    # "reference", and "info.date_format" keys to be present in the input.
    sequences = sequence_set(
        logger,
        data["sequences"],
        data["reference"],
        data["info"]["date_format"]
    )

    # Write sequences to standard out.
    # NOTE(review): sequences.seqs appears to be a dict of Bio.SeqRecord-like
    # objects keyed by sequence name — confirm against sequence_set.
    output_sequences = sequences.seqs.values()
    Bio.SeqIO.write(output_sequences, sys.stdout, "fasta")

    # Prepare metadata if it has been requested: one row per sequence, built
    # from each record's attributes dict.
    if args.metadata:
        metadata = [sequences.seqs[seq].attributes for seq in sequences.seqs]
        metadata_df = pd.DataFrame(metadata)
        # Rename to avoid clobbering a downstream "num_date" column —
        # presumably recomputed later in the pipeline; verify with callers.
        metadata_df = metadata_df.rename(columns={"num_date": "prepared_num_date"})
        metadata_df.to_csv(args.metadata, sep="\t", index=False)
Exemplo n.º 2
0
    def __init__(self, config):
        """Check the config, create output directories, set up logging, and
        load the prepared (augur) JSON into instance attributes.

        Parameters
        ----------
        config : dict
            Process-specific configuration; merged with defaults via
            ``combine_configs("process", config)``.

        Side effects
        ------------
        Calls ``sys.exit(2)`` when not run from the configured directory,
        creates any missing output directories, optionally removes
        intermediate files (``clean`` mode), and calls ``self.log.fatal``
        (which presumably terminates — confirm against the logger class) on
        a missing/malformed input JSON.
        """
        super(process, self).__init__()
        self.config = combine_configs("process", config)

        # Explicit check rather than `assert`, which is stripped under -O
        # and must not be used for input validation.
        if os.path.basename(os.getcwd()) != self.config["dir"]:
            print("Run this script from within the {} directory".format(
                self.config["dir"]))
            sys.exit(2)

        # Ensure every configured output directory exists.
        for p in self.config["output"].values():
            if not os.path.isdir(p):
                os.makedirs(p)

        self.log = logger(self.config["output"]["data"], False)

        # parse the JSON into different data bits
        try:
            with open(self.config["in"], 'r') as fh:
                data = json.load(fh)
        except Exception as e:
            # NOTE(review): assumes log.fatal aborts execution; otherwise
            # `data` would be unbound below — confirm.
            self.log.fatal("Error loading JSON. Error: {}".format(e))

        self.info = data["info"]
        if "time_interval" in data["info"]:
            # Convert "YYYY-MM-DD" strings into datetime.date objects.
            self.info["time_interval"] = [
                datetime.strptime(x, '%Y-%m-%d').date()
                for x in data["info"]["time_interval"]
            ]
        self.info["lineage"] = data["info"]["lineage"]

        if 'leaves' in data:
            self.tree_leaves = data['leaves']

        # Optional display metadata. False (not None) is the historical
        # sentinel expected downstream, so it is preserved.
        try:
            self.colors = data["colors"]
        except KeyError:
            self.log.notify("* colours have not been set")
            self.colors = False
        try:
            self.lat_longs = data["lat_longs"]
        except KeyError:
            self.log.notify("* latitude & longitudes have not been set")
            self.lat_longs = False

        # backwards compatability - set up file_dumps (need to rewrite sometime)
        self.file_dumps = {}
        self.output_path = os.path.join(self.config["output"]["data"],
                                        self.info["prefix"])
        self.file_dumps['seqs'] = self.output_path + '_sequences.pkl.gz'
        self.file_dumps['tree'] = self.output_path + '_tree.newick'
        self.file_dumps['nodes'] = self.output_path + '_nodes.pkl.gz'

        # Truthiness check instead of `== True`.
        if self.config["clean"]:
            self.log.notify("Removing intermediate files for a clean build")
            for f in glob.glob(self.output_path + "*"):
                os.remove(f)

        if "reference" in data:
            self.seqs = sequence_set(self.log, data["sequences"],
                                     data["reference"],
                                     self.info["date_format"])
        else:
            # NOTE(review): assumes log.fatal aborts; otherwise the
            # attribute accesses below would raise on a missing self.seqs.
            self.log.fatal("No reference provided. Cannot continue.")

        # backward compatability
        self.reference_seq = self.seqs.reference_seq
        self.proteins = self.seqs.proteins

        for trait in self.info["traits_are_dates"]:
            self.seqs.convert_trait_to_numerical_date(trait,
                                                      self.info["date_format"])

        # Prepare titers if they are available.
        if "titers" in data:
            self.log.debug("Loaded %i titer measurements" %
                           len(data["titers"]))
            # Convert titer dictionary indices from JSON-compatible strings
            # back to tuples. `.iteritems()` was Python 2 only — `.items()`
            # works everywhere. `ast.literal_eval` replaces `eval` so a
            # crafted key in the input file cannot execute arbitrary code;
            # it still parses tuple literals like "('strain', 'serum')".
            import ast
            self.titers = {
                ast.literal_eval(key): value
                for key, value in data["titers"].items()
            }

        ## useful flag to set (from pathogen run file) to disable restoring
        self.try_to_restore = True