Example #1
    def __init__(self, **kwargs):
        '''
        Congruent with other nextstrain builds, dengue_process is a catch-all class
        that initially holds the input data paths and params arguments.
        '''
        super(process, self).__init__()

        ##### Handle serotype-specific file input/output. #####
        self.serotype = kwargs['serotype']
        self.lineage = 'dengue_%s' % self.serotype
        if self.serotype == 'all':  # For all-serotype build, use the dengue 4 outgroup and look for files like dengue_all.fasta
            self.reference_fname = './dengue/metadata/dengue_denv4_outgroup.gb'
            newest_sequence_file = sorted(
                glob('../fauna/data/%s.fasta' % self.lineage),
                key=lambda f: os.path.getmtime(f))[-1]
        else:
            self.reference_fname = './dengue/metadata/%s_outgroup.gb' % self.lineage
            try:  # Look for a serotype-specific fasta
                newest_sequence_file = sorted(
                    glob('../fauna/data/%s*.fasta' % self.lineage),
                    key=lambda f: os.path.getmtime(f))[-1]
            except IndexError:  # If it doesn't exist, pull serotype-specific sequences out of the all-serotype fasta (warn the user of this behavior)
                newest_sequence_file = select_serotype(
                    '../fauna/data/dengue_all.fasta', '../fauna/data/',
                    self.serotype)
                print(
                    'WARNING: Did not find serotype-specific fasta file.\nPulled sequences with serotype %s from all-serotype fasta file %s\nWrote these to file %s'
                    % (self.serotype, '../fauna/data/dengue_all.fasta',
                       newest_sequence_file))

        self.input_data_path = newest_sequence_file.split('.fasta')[0]
        self.sequence_fname = newest_sequence_file
        self.store_data_path = 'store/' + self.lineage + '_'
        self.build_data_path = 'build/' + self.lineage + '_'
        self.proteins = [
            'C', 'M', 'E', 'NS1', 'NS2A', 'NS2B', 'NS3', 'NS4A', '2K', 'NS4B',
            'NS5'
        ]

        ##### Initialize process object #####
        self.dengue = process(
            input_data_path=self.input_data_path,
            store_data_path=self.store_data_path,
            build_data_path=self.build_data_path,
            proteins=self.proteins,
            reference=self.reference_fname,
            method='SLSQP',
            lat_long_fname='../fauna/source-data/geo_lat_long.tsv')
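The `select_serotype` helper called above (and again in Example #4) is not shown in these snippets. A minimal sketch of what it might do, assuming Biopython and a serotype tag in the fasta headers; the real implementation may differ:

from Bio import SeqIO  # assumption: Biopython is available

def select_serotype(infile, path, serotype):
    '''
    Hypothetical sketch: pull sequences whose header mentions the requested
    serotype out of the all-serotype fasta, write them to `path`, and
    return the new filename.
    '''
    outfile = path + 'dengue_%s.fasta' % serotype
    selected = [s for s in SeqIO.parse(infile, 'fasta')
                if str(serotype).lower() in s.description.lower()]
    SeqIO.write(selected, outfile, 'fasta')
    return outfile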
Example #2
        "dir": "mumps",
        "in": prepared_json,
        "newick_tree_options": {
            "nthreads": 4
        },
        "clock_filter": {
            "n_iqd": 4,
        },
        "geo_inference": geo_inference,
        "auspice": {  ## settings for auspice JSON export
            "color_options": color_options,
            "defaults": defaults
        },
        "clean": clean
    }


if __name__ == "__main__":
    params = parser.parse_args()
    jsons = glob.glob(
        "prepared/*.json") if "all" in params.jsons else params.jsons
    for prepared_json in jsons:
        print("Processing {}".format(prepared_json))
        runner = process(make_config(prepared_json, params.clean, params))
        runner.align()
        runner.build_tree()
        runner.timetree_setup_filter_run()
        runner.run_geo_inference()
        runner.save_as_nexus()
        runner.auspice_export()
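The `parser` used in this `__main__` block is defined at module level and not shown. A minimal sketch of the arguments the block relies on, with flag names inferred from `params.jsons` and `params.clean` (an assumption, not the original code):

import argparse

parser = argparse.ArgumentParser(description="Process prepared mumps JSONs")
parser.add_argument('-j', '--jsons', nargs='+', default=["all"],
                    help='prepared JSON(s) to process, or "all"')
parser.add_argument('--clean', action='store_true',
                    help='remove intermediate files and rerun from scratch')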
Example #3
    "newick_tree_options": {"nthreads": 4},
    "clock_filter": {
        "n_iqd": 4,
    },
    "geo_inference": ['country', 'region'], # what traits to perform this on
    "geo_inference_options": {
        "root_state": {
            "region": "southeast_asia",
            "country": "vietnam",
        },
    },
    "auspice": { ## settings for auspice JSON export
        "color_options": {
            "country":{"key":"country", "legendTitle":"Country", "menuItem":"country", "type":"discrete"},
            "region":{"key":"region", "legendTitle":"Region", "menuItem":"region", "type":"discrete"},
        },
        "controls": {'authors':['authors']}
    }
}

if __name__=="__main__":
    params = parser.parse_args()
    if params.clean: config["clean"] = True
    runner = process(config)
    runner.align()
    runner.build_tree()
    runner.timetree_setup_filter_run()
    runner.run_geo_inference()
    runner.save_as_nexus()
    runner.auspice_export()
Example #4
    def __init__(self, **kwargs):
        super(process, self).__init__()

        self.serotype = kwargs['serotype']
        if self.serotype == 'any':  # For any-serotype build, use dengue 3 outgroup and look for files like dengue.fasta
            self.lineage = 'dengue'
            self.reference_fname = './dengue/metadata/dengue_3_outgroup.gb'
            newest_sequence_file = sorted(
                glob('../fauna/data/%s.fasta' % self.lineage),
                key=lambda f: os.path.getmtime(f))[-1]
        else:
            self.lineage = 'dengue_%s' % self.serotype  # For serotype-specific build, use the corresponding outgroup
            self.reference_fname = './dengue/metadata/dengue_%s_outgroup.gb' % self.serotype
            try:  # Look for a serotype-specific fasta
                newest_sequence_file = sorted(
                    glob('../fauna/data/%s*.fasta' % self.lineage),
                    key=lambda f: os.path.getmtime(f))[-1]
            except IndexError:  # If it doesn't exist, pull serotype-specific sequences out of the any-serotype fasta (warn the user of this behavior)
                newest_sequence_file = select_serotype(
                    '../fauna/data/dengue.fasta', '../fauna/data/',
                    self.serotype)
                print(
                    'WARNING: Did not find serotype-specific fasta file.\nPulled sequences with serotype %s from any-serotype fasta file %s\nWrote these to file %s'
                    % (self.serotype, '../fauna/data/dengue.fasta',
                       newest_sequence_file))

        self.input_data_path = newest_sequence_file.split('.fasta')[0]
        self.sequence_fname = newest_sequence_file
        self.store_data_path = 'store/' + self.lineage + '_'
        self.build_data_path = 'build/' + self.lineage + '_'
        self.proteins = [
            'C', 'M', 'E', 'NS1', 'NS2A', 'NS2B', 'NS3', 'NS4A', '2K', 'NS4B',
            'NS5'
        ]

        self.dengue = process(input_data_path=self.input_data_path,
                              store_data_path=self.store_data_path,
                              build_data_path=self.build_data_path,
                              proteins=self.proteins,
                              reference=self.reference_fname,
                              method='SLSQP')

        if params.load:
            self.dengue.load()
        else:
            self.fasta_fields = {
                0: 'strain',
                1: 'accession',
                2: 'date',
                3: 'region',
                4: 'country',
                5: 'division',
                6: 'location'
            }
            self.dengue.load_sequences(fields=self.fasta_fields)
            self.dengue.seqs.filter(lambda s: len(s.seq) >= 5000)
            self.dropped_strains = []
            self.dengue.seqs.filter(lambda s: s.id not in self.dropped_strains)
            self.dengue.seqs.subsample(
                category=lambda x: (x.attributes['region'],
                                    x.attributes['date'].year,
                                    x.attributes['date'].month),
                threshold=params.viruses_per_month)
            self.dengue.align()
            self.dengue.build_tree()

        self.dengue.clock_filter(n_iqd=3, plot=True)
        self.dengue.annotate_tree(Tc=0.005, timetree=True, reroot='best')
        self.dengue.tree.geo_inference('region')
        self.dengue.export(controls=attribute_nesting)
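This example leans on two module-level names the snippet omits: `params` (command-line arguments) and `attribute_nesting` (the controls passed to export). A hypothetical setup modeled on the sibling zika/ebola builds, purely to make the snippet self-contained:

import argparse

parser = argparse.ArgumentParser(description='Process dengue virus sequences')
parser.add_argument('-v', '--viruses_per_month', type=int, default=100,
                    help='number of viruses sampled per month')
parser.add_argument('--load', action='store_true', help='recover from file')
params = parser.parse_args()

# assumed value; the original defines this elsewhere
attribute_nesting = {'geographic location': ['region']}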
Example #5
                        default=1.0,
                        help='number of hours raxml is run')
    parser.add_argument('--load',
                        action='store_true',
                        help='recover from file')
    params = parser.parse_args()

    lineage = 'ebola'
    input_data_path = '../fauna/data/' + lineage
    store_data_path = 'store/' + lineage + '_'
    build_data_path = 'build/' + lineage + '_'

    ebola = process(
        input_data_path=input_data_path,
        store_data_path=store_data_path,
        build_data_path=build_data_path,
        reference='ebola/metadata/ebola_outgroup.gb',
        proteins=['NP', 'VP35', 'VP40', 'GP', 'sGP', 'VP30', 'VP24', 'L'],
        method='SLSQP')

    if not params.load:
        fasta_fields = {
            0: 'strain',
            2: 'accession',
            3: 'date',
            4: 'region',
            5: 'country',
            6: 'division',
            8: 'db',
            10: 'authors'
        }
Example #6
                "menuItem": "date",
                "type": "continuous"
            },
            "gt": {
                "key": "genotype",
                "legendTitle": "Genotype",
                "menuItem": "genotype",
                "type": "discrete"
            }
        }
    }

    HA = process(input_data_path=params["HA"]["input_data"],
                 store_data_path='store/' + params["HA"]["lineage"] + '_',
                 build_data_path='build/' + params["HA"]["lineage"] + '_',
                 reference=params["HA"]["reference_fname"],
                 lat_long_fname='../fauna/source-data/geo_lat_long.tsv',
                 proteins=params["HA"]['proteins'],
                 method='SLSQP',
                 verbose=0)
    NA = process(input_data_path=params["NA"]["input_data"],
                 store_data_path='store/' + params["NA"]["lineage"] + '_',
                 build_data_path='build/' + params["NA"]["lineage"] + '_',
                 reference=params["NA"]["reference_fname"],
                 lat_long_fname='../fauna/source-data/geo_lat_long.tsv',
                 proteins=params["NA"]['proteins'],
                 method='SLSQP',
                 verbose=0)

    segments = [HA, NA]
    segmentNames = ["HA", "NA"]
    time_interval = [datetime.strptime(x, '%Y-%m-%d').date() for x in params.time_interval]
    pivots = np.arange(time_interval[0].year+(time_interval[0].month-1)/12.0,
                       time_interval[1].year+time_interval[1].month/12.0, 1.0/ppy)
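For concreteness, the pivot calculation above turns the date interval into evenly spaced decimal years. A standalone check, assuming twelve pivots per year (`ppy` is defined elsewhere in the original):

from datetime import datetime
import numpy as np

ppy = 12  # assumption: 12 pivots per year
time_interval = [datetime.strptime(x, '%Y-%m-%d').date()
                 for x in ('2015-01-01', '2016-01-01')]
pivots = np.arange(time_interval[0].year + (time_interval[0].month - 1) / 12.0,
                   time_interval[1].year + time_interval[1].month / 12.0,
                   1.0 / ppy)
# -> evenly spaced decimal-year pivots, roughly one per month: 2015.0, 2015.083, ...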

Example #7
    # load data from all segments
    segment_names = ['pb1', 'pb2', 'pa', 'ha', 'np', 'na', 'm', 'ns']
    segments = {}
    viruses = defaultdict(list)
    for seg in segment_names:
        input_data_path = '../fauna/data/'+params.lineage+'_'+seg
        if seg == 'm':
            input_data_path += 'p'
        store_data_path = 'store/'+params.lineage + '_' + params.resolution + '_' + seg + '_'
        build_data_path = 'build/'+params.lineage + '_' + params.resolution + '_' + seg + '_'
        flu = process(input_data_path=input_data_path,
                      store_data_path=store_data_path,
                      build_data_path=build_data_path,
                      reference='flu/metadata/' + params.lineage + '_' + seg + '_outgroup.gb',
                      proteins=['SigPep', 'HA1', 'HA2'],
                      method='SLSQP', inertia=np.exp(-1.0 / ppy), stiffness=2. * ppy)

        flu.load_sequences(fields={0: 'strain', 2: 'isolate_id', 3: 'date', 4: 'region',
                                   5: 'country', 7: 'city', 12: 'subtype', 13: 'lineage'})

        print("## loading data for segment %s, found %d number of sequences"%(seg, len(flu.seqs.all_seqs)))
        for sequence in flu.seqs.all_seqs:
            viruses[sequence].append(seg)

        segments[seg] = flu

    # determine strains that are complete
    complete_strains = set(x for x in viruses if len(viruses[x]) == len(segment_names))
    # filter every segment down to the strains for which all segments exist
    segments['ha'].seqs.filter(lambda s: s.name in complete_strains)
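The completeness filter above keeps only strains observed in every segment. A toy illustration of the same idea:

from collections import defaultdict

viruses = defaultdict(list)  # strain -> segments in which it was found
for strain, seg in [('A/x/2016', 'ha'), ('A/x/2016', 'na'), ('A/y/2016', 'ha')]:
    viruses[strain].append(seg)

segment_names = ['ha', 'na']
complete = [s for s in viruses if len(viruses[s]) == len(segment_names)]
print(complete)  # ['A/x/2016'] -- 'A/y/2016' lacks an NA sequence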
Example #8
        "estimate_tree_frequencies": not args.no_tree_freqs,
        "clean": args.clean,
        "pivot_spacing": 1.0 / 12,
        "timetree_options": {
            "Tc": 0.03
        }
    }


if __name__ == "__main__":
    args = collect_args()
    jsons = glob.glob("prepared/*.json") if "all" in args.jsons else args.jsons

    for prepared_json in jsons:
        pprint("Processing {}".format(prepared_json))
        runner = process(make_config(prepared_json, args))
        runner.align()

        # estimate mutation frequencies here.
        # While this could be in a wrapper, it is hopefully more readable this way!
        if runner.config["estimate_mutation_frequencies"]:
            pivots = runner.get_pivots_via_spacing()
            runner.estimate_mutation_frequencies(pivots=pivots,
                                                 min_freq=0.02,
                                                 inertia=np.exp(-1.0 / 12),
                                                 stiffness=0.8 * 12)
            acronyms = set(x[1] for x in runner.info["regions"] if x[1] != "")
            region_groups = {
                str(x):
                [str(y[0]) for y in runner.info["regions"] if y[1] == x]
Example #9
    parser.add_argument('--load',
                        action='store_true',
                        help='recover from file')
    params = parser.parse_args()

    lineage = 'zika'
    input_data_path = '../fauna/data/' + lineage
    store_data_path = 'store/' + lineage + '_'
    build_data_path = 'build/' + lineage + '_'

    zika = process(input_data_path=input_data_path,
                   store_data_path=store_data_path,
                   build_data_path=build_data_path,
                   reference='zika/metadata/zika_outgroup.gb',
                   lat_long_fname='../fauna/source-data/geo_lat_long.tsv',
                   proteins=[
                       'CA', 'PRO', 'MP', 'ENV', 'NS1', 'NS2A', 'NS2B', 'NS3',
                       'NS4A', 'NS4B', 'NS5'
                   ],
                   method='SLSQP',
                   verbose=params.verbose)
    if params.load:
        zika.load()
    else:
        fasta_fields = {
            0: 'strain',
            2: 'accession',
            3: 'date',
            4: 'region',
            5: 'country',
            6: 'division',
Example #10
    parser = argparse.ArgumentParser(description='Process virus sequences, build tree, and prepare for web visualization')
    parser.add_argument('-v', '--viruses_per_month', type=int, default=100, help='number of viruses sampled per month')
    parser.add_argument('-r', '--raxml_time_limit', type=float, default=1.0, help='number of hours raxml is run')
    parser.add_argument('--load', action='store_true', help='recover from file')
    params = parser.parse_args()

    lineage = 'zika'
    input_data_path = '../fauna/data/'+lineage
    store_data_path = 'store/'+lineage + '_'
    build_data_path = 'build/'+lineage + '_'

    zika = process(input_data_path=input_data_path,
                   store_data_path=store_data_path,
                   build_data_path=build_data_path,
                   reference='zika/metadata/zika_outgroup.gb',
                   proteins=['CA', 'PRO', 'MP', 'ENV', 'NS1', 'NS2A',
                             'NS2B', 'NS3', 'NS4A', 'NS4B', 'NS5'],
                   method='SLSQP')

    if params.load:
        zika.load()
    else:
        fasta_fields = {0: 'strain', 2: 'accession', 3: 'date', 4: 'region', 5: 'country',
                        6: 'division', 8: 'db', 10: 'authors', 11: 'latitude', 12: 'longitude'}
        zika.load_sequences(fields=fasta_fields)
        zika.seqs.filter(lambda s: s.attributes['date'] >= datetime(2012, 1, 1).date() and
                                   s.attributes['date'] < datetime(2017, 1, 1).date())
        zika.seqs.filter(lambda s: len(s.seq) >= 2000)
        dropped_strains = [
            "THA/PLCal_ZV/2013", "PLCal_ZV", # true strains, too basal for analysis