Example #1
def simulate():

    rearrange()

    glfo, naive_event_list, cpath = utils.read_output(naive_fname())
    assert len(naive_event_list) == args.n_sim_events

    outdirs = [
        '%s/event-%d' % (simdir(), i) for i in range(len(naive_event_list))
    ]

    for ievent, (naive_line,
                 outdir) in enumerate(zip(naive_event_list, outdirs)):
        run_bcr_phylo(naive_line, outdir, ievent)

    if utils.output_exists(
            args, simfname(), outlabel='mutated simu', offset=4
    ):  # i guess if it crashes during the plotting just below, this'll get confused
        return

    mutated_events = []
    for ievent, (naive_line,
                 outdir) in enumerate(zip(naive_event_list, outdirs)):
        mutated_events.append(
            parse_bcr_phylo_output(glfo, naive_line, outdir, ievent))

    print '  writing annotations to %s' % simfname()
    utils.write_annotations(simfname(), glfo, mutated_events,
                            utils.simulation_headers)

    import plotting
    for outdir, event in zip(outdirs, mutated_events):
        plotting.plot_bcr_phylo_simulation(outdir, event, args.extrastr,
                                           args.metric_for_target_distance)
Example #2
def test_advanced(output_test_path, rendered_template, helper, client):
    form = AdvancedForm()
    pack = helper.template_pack

    helper.layout = Layout(
        Row(
            Column(
                'simple',
                css_class='six'
            ),
            Column(
                'opt_in',
                css_class='six'
            ),
        ),
        Row(
            Column(
                'longtext'
            ),
        ),
        Row(
            Column(
                ButtonHolder(Submit('submit', 'Submit')),
            ),
            css_class="large"
        ),
    )

    rendered = rendered_template(form, helper=helper)

    attempted = read_output(output_test_path, pack, "test_advanced.html")
    #write_output(output_test_path, pack, "test_advanced.html", rendered)

    assert rendered == attempted
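
The read_output/write_output pair these tests call isn't shown on this page; here is a plausible sketch of such helpers, inferred purely from the call sites above (this is an assumption, not django-crispy-forms' actual test code):

import io
import os

def read_output(test_path, pack, name):
    # load the stored expected rendering for this template pack (hypothetical helper)
    with io.open(os.path.join(test_path, pack, name), encoding='utf-8') as f:
        return f.read()

def write_output(test_path, pack, name, rendered):
    # regenerate the expected-output file; the tests keep this call commented out
    with io.open(os.path.join(test_path, pack, name), 'w', encoding='utf-8') as f:
        f.write(rendered)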
Example #3
 def __init__(self, root_user_id="", max_population=24, \
   max_friends_per_user=5, community_file="", new=True, safe=False):
     '''Either load a preexisting community to add to or start a new one.
     If not starting a new community, root_user_id is ignored. The
     community is loaded from/saved to community_file.'''
     logging.basicConfig(filename=LOG_FILENAME,level=logging.DEBUG,
       format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
       datefmt='%m-%d %H:%M')
     self._new = new
     self._write_path = community_file
     self._max_population = max_population
     self._max_friends_per_user = max_friends_per_user
     self._safe = safe
     #new community
     if new and root_user_id:
         self._root_user_id = root_user_id
         self._node_pool = {}
         self._community_members = []
         self._community = {
             'root_user_id':root_user_id,
             'node_pool':{},
             'members':[],
         }
     #add to existing community
     elif not new:
         self._community = utils.read_output(community_file)
         self._root_user_id = self._community['root_user_id']
         self._node_pool = self._community['node_pool']
         self._community_members = self._community['members']
Example #4
def simulate():

    rearrange()

    glfo, naive_event_list, cpath = utils.read_output('%s/naive-simu.yaml' %
                                                      simdir(args.stype))
    assert len(naive_event_list) == args.n_sim_events

    outdirs = [
        '%s/event-%d' % (simdir(args.stype), i)
        for i in range(len(naive_event_list))
    ]

    print '    running bcr-phylo for %d naive rearrangements' % len(
        naive_event_list)
    for ievent, (naive_line,
                 outdir) in enumerate(zip(naive_event_list, outdirs)):
        run_bcr_phylo(naive_line, outdir, ievent)

    mutated_events = []
    for ievent, (naive_line,
                 outdir) in enumerate(zip(naive_event_list, outdirs)):
        mutated_events.append(
            parse_bcr_phylo_output(glfo, naive_line, outdir, ievent))

    print '  writing annotations to %s' % simfname(args.stype)
    utils.write_annotations(simfname(args.stype), glfo, mutated_events,
                            utils.simulation_headers)

    import plotting
    for outdir, event in zip(outdirs, mutated_events):
        plotting.plot_bcr_phylo_simulation(outdir, event, args.extrastr)
Example #5
 def __init__(self, root_user_id="", max_population=24, \
   max_friends_per_user=5, community_file="", new=True, safe=False):
     '''Either load a preexisting community to add to or start a new one.
     If not starting a new community, root_user_id is ignored. The
     community is loaded from/saved to community_file.'''
     logging.basicConfig(
         filename=LOG_FILENAME,
         level=logging.DEBUG,
         format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
         datefmt='%m-%d %H:%M')
     self._new = new
     self._write_path = community_file
     self._max_population = max_population
     self._max_friends_per_user = max_friends_per_user
     self._safe = safe
     #new community
     if new and root_user_id:
         self._root_user_id = root_user_id
         self._node_pool = {}
         self._community_members = []
         self._community = {
             'root_user_id': root_user_id,
             'node_pool': {},
             'members': [],
         }
     #add to existing community
     elif not new:
         self._community = utils.read_output(community_file)
         self._root_user_id = self._community['root_user_id']
         self._node_pool = self._community['node_pool']
         self._community_members = self._community['members']
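
A hypothetical usage sketch for the constructor above, assuming the enclosing class is named Community (the class statement isn't shown in the snippet) and that LOG_FILENAME is defined at module scope:

# start a brand-new community rooted at a given user
fresh = Community(root_user_id='12345', community_file='out.pkl', new=True)
# reload an existing community to keep adding to it; root_user_id is ignored
existing = Community(community_file='out.pkl', new=False)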
Example #6
def read_partis_output(partition_file, glfo_dir=None, locus=None):
    glfo = (None if utils.getsuffix(partition_file) == ".yaml"
            else glutils.read_glfo(glfo_dir if glfo_dir else default_glfo_dir, locus))
    glfo, annotation_list, cpath = utils.read_output(
        partition_file, glfo=glfo
    )  # returns glfo from the file if it's there, otherwise it returns the one we passed in
    return glfo, annotation_list, cpath
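
Most of the partis examples on this page follow the pattern this wrapper captures: utils.read_output() returns a (glfo, annotation_list, cpath) triple. A minimal consumption sketch, assuming a partis checkout whose python/ directory is on sys.path and a hypothetical yaml output file (yaml outputs carry their own germline info, so no glfo needs to be passed in):

import sys
sys.path.insert(0, '/path/to/partis/python')  # hypothetical checkout location
import utils

glfo, annotation_list, cpath = utils.read_output('out.yaml')
for line in annotation_list:
    # each annotation line describes one cluster of sequence ids
    print ':'.join(line['unique_ids'])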
Example #7
def processed_data(args):
    """Uses args to find the correct partition, cluster pair and all associated information. Cluster
    information is returned as by process_cluster."""

    print("calling utils.read_output with args:", args.partition_file,
          args.glfo)
    file_glfo, annotation_list, cpath = utils.read_output(args.partition_file,
                                                          glfo=args.glfo)
    if annotation_list is None:
        raise Exception('cluster annotation file not found')
    if file_glfo:  # will only be set if we're reading a yaml file
        args.glfo = file_glfo

    # select partition, relative to best partition
    ipart = cpath.i_best + args.partition

    # select cluster; unique_ids takes highest precedence
    if args.unique_ids:
        cluster_unique_ids = args.unique_ids
    # default to seed, when possible
    elif cpath.seed_unique_id and not args.cluster:
        cluster_unique_ids = next(cluster
                                  for cluster in cpath.partitions[ipart]
                                  if cpath.seed_unique_id in cluster)
    # otherwise, assume we have args.cluster or default it to 0
    else:
        clusters = sorted(cpath.partitions[ipart], key=len, reverse=True)
        cluster_unique_ids = clusters[args.cluster or 0]

    # select the annotation matching the chosen cluster
    annotations = [
        l for l in annotation_list if l['unique_ids'] == cluster_unique_ids
    ]
    if len(annotations) == 0:
        raise ValueError(
            'requested uids %s not found in %s' %
            (cluster_unique_ids, args.partition_file)
        )  # it was a value error before, so I'm leaving it at that
    elif len(annotations) > 1:
        print '%s more than one annotation with requested uids %s found in %s' % (
            utils.color('red', 'warning'), cluster_unique_ids,
            args.partition_file)  # shouldn't be possible
    cluster_annotation = annotations[0]
    data = {
        'n_clusters': len(cpath.partitions[ipart]),
        'logprob': cpath.logprobs[ipart],
        'partition_file': args.partition_file,
        'last_modified': time.ctime(os.path.getmtime(args.partition_file))
    }
    if args.seqs_out:
        data['seqs_file'] = os.path.relpath(args.seqs_out,
                                            args.paths_relative_to)
    # Process the annotation file specific details/data
    data.update(process_cluster(args, cluster_annotation,
                                cpath.seed_unique_id))
    return data
Example #8
def test_basic(output_test_path, rendered_template, helper, client):
    form = BasicInputForm()
    pack = helper.template_pack

    rendered = rendered_template(form, helper=helper)

    attempted = read_output(output_test_path, pack, "test_basic.html")
    #write_output(output_test_path, pack, "test_basic.html", rendered)

    assert rendered == attempted
Example #9
def test_layout(output_test_path, rendered_template, helper, client):
    form = BasicInputFormLayoutIncluded(helper=helper)
    pack = helper.template_pack

    rendered = rendered_template(form)

    attempted = read_output(output_test_path, pack, "test_layout.html")
    #write_output(output_test_path, pack, "test_layout.html", rendered)

    assert rendered == attempted
Example #10
def main():
    community = read_output('pickled_populations/lizardbill_11_20_2010')
    s = PopulationStats(community['members'])
    community_member_names = s.all_user_names()
    for user in community['members']:
        print ''
        print 'User:', user['screen_name']
        print 'ID:', user['uid']
        print 'Tweet Count:', len(user['tweets'])
        print 'Friend IDs:', len(user['friend_ids'])
        print 'Follower IDs:', len(user['follower_ids'])
    print 'Community Members:', len(community['members'])
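
In this example (and the DayCounts one further down) read_output is clearly a different, project-local helper that unpickles a saved community dict, not the partis function. A minimal sketch of what it might look like, offered as an assumption rather than the project's real code:

import pickle

def read_output(path):
    # hypothetical reconstruction: load a pickled community dict from disk
    with open(path, 'rb') as f:
        return pickle.load(f)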
Example #11
def read_rearrangements():
    if args.paired_loci:
        lp_infos = paircluster.read_lpair_output_files(
            lpairs(), naive_fname, dbgstr='naive simulation')
        naive_events = paircluster.get_both_lpair_antn_pairs(
            lpairs(), lp_infos)
        glfos, _, _ = paircluster.concat_heavy_chain(
            lpairs(), lp_infos)  # per-locus glfos with concat'd heavy chain
    else:
        glfo, naive_events, _ = utils.read_output(naive_fname(None))
        glfos = [glfo]
    return glfos, naive_events
Example #12
def test_inlinefield(output_test_path, rendered_template, helper, client):
    form = BasicInputForm()
    pack = helper.template_pack

    helper.layout = Layout(
        InlineField('simple',
                    label_column='large-7',
                    input_column='large-5',
                    label_class='foobar'))

    rendered = rendered_template(form, helper=helper)

    attempted = read_output(output_test_path, pack, "test_inlinefield.html")
    #write_output(output_test_path, pack, "test_inlinefield.html", rendered)

    assert rendered == attempted
Example #13
def test_buttongroup(output_test_path, rendered_template, helper, client):
    form = BasicInputForm()
    pack = helper.template_pack

    helper.layout = Layout(
        'simple',
        ButtonGroup(
            Submit('Save', 'Save'),
            Button('Cancel', 'Cancel'),
        )
    )

    rendered = rendered_template(form, helper=helper)

    attempted = read_output(output_test_path, pack, "test_buttongroup.html")
    #write_output(output_test_path, pack, "test_buttongroup.html", rendered)

    assert rendered == attempted
Example #14
def test_tab(output_test_path, rendered_template, helper, client):
    form = AdvancedForm()
    pack = helper.template_pack

    helper.layout = Layout(
        TabHolder(
            TabItem('My tab 1', 'simple'),
            TabItem('My tab 2', 'opt_in'),
            TabItem('My tab 3', 'longtext'),
            css_id="meep-meep"
        )
    )

    rendered = rendered_template(form, helper=helper)

    attempted = read_output(output_test_path, pack, "test_tab.html")
    #write_output(output_test_path, pack, "test_tab.html", rendered)

    assert attempted == rendered
Example #15
def test_inlineswitchfield(output_test_path, rendered_template, helper,
                           client):
    form = BoolInputForm()
    pack = helper.template_pack

    helper.layout = Layout(
        InlineSwitchField('opt_in',
                          label_column='large-8',
                          input_column='large-4',
                          label_class='foobar',
                          switch_class="inline"))

    rendered = rendered_template(form, helper=helper)

    attempted = read_output(output_test_path, pack,
                            "test_inlineswitchfield.html")
    #write_output(output_test_path, pack, "test_inlineswitchfield.html", rendered)

    assert rendered == attempted
Example #16
def test_accordion(output_test_path, rendered_template, helper, client):
    form = AdvancedForm()
    pack = helper.template_pack

    # Define 'css_id' to avoid test failures from the automatically generated random ID
    helper.layout = Layout(
        AccordionHolder(
            AccordionItem('Group 1', 'simple'),
            AccordionItem('Group 2', 'opt_in'),
            AccordionItem('Group 3', 'longtext'),
            css_id="meep-meep"
        )
    )

    rendered = rendered_template(form, helper=helper)

    attempted = read_output(output_test_path, pack, "test_accordion.html")
    #write_output(output_test_path, pack, "test_accordion.html", rendered)

    assert attempted == rendered
Example #17
def simulate():

    rearrange()

    glfo, naive_event_list, cpath = utils.read_output(naive_fname())
    assert len(naive_event_list) == args.n_sim_events

    outdirs = ['%s/event-%d' % (simdir(), i) for i in range(len(naive_event_list))]

    start = time.time()
    cmdfos = []
    if args.n_procs > 1:
        print '    starting %d events' % len(naive_event_list)
    uid_str_len = 6 + int(math.log(len(naive_event_list), 10))  # if the final sample's going to contain many trees, it's worth making the uids longer so there are fewer collisions/duplicates
    for ievent, (naive_line, outdir) in enumerate(zip(naive_event_list, outdirs)):
        if args.n_sim_events > 1 and args.n_procs == 1:
            print '  %s %d' % (utils.color('blue', 'ievent'), ievent)
        cfo = run_bcr_phylo(naive_line, outdir, ievent, len(naive_event_list), uid_str_len=uid_str_len)  # if n_procs > 1, doesn't run, just returns cfo
        if cfo is not None:
            print '      %s %s' % (utils.color('red', 'run'), cfo['cmd_str'])
            cmdfos.append(cfo)
    if args.n_procs > 1 and len(cmdfos) > 0:
        utils.run_cmds(cmdfos, shell=True, n_max_procs=args.n_procs, batch_system='slurm' if args.slurm else None, allow_failure=True, debug='print')
    print '  bcr-phylo run time: %.1fs' % (time.time() - start)

    if utils.output_exists(args, simfname(), outlabel='mutated simu', offset=4):  # i guess if it crashes during the plotting just below, this'll get confused
        return

    start = time.time()
    mutated_events = []
    for ievent, (naive_line, outdir) in enumerate(zip(naive_event_list, outdirs)):
        mutated_events.append(parse_bcr_phylo_output(glfo, naive_line, outdir, ievent))
    print '  parsing time: %.1fs' % (time.time() - start)

    print '  writing annotations to %s' % simfname()
    utils.write_annotations(simfname(), glfo, mutated_events, utils.simulation_headers)

    if not args.only_csv_plots:
        import lbplotting
        for outdir, event in zip(outdirs, mutated_events):
            lbplotting.plot_bcr_phylo_simulation(outdir, event, args.extrastr, lbplotting.metric_for_target_distance_labels[args.metric_for_target_distance])
Example #18
    '--max-family-size',
    type=int,
    help='subset each family down to this size before passing to treeutils')
parser.add_argument(
    '--min-selection-metric-cluster-size',
    type=int,
    default=treeutils.default_min_selection_metric_cluster_size)
parser.add_argument('--include-relative-affy-plots', action='store_true')
# tree plots turned off in the treeutils fcn atm
# parser.add_argument('--ete-path', default=('/home/%s/anaconda_ete/bin' % os.getenv('USER')) if os.getenv('USER') is not None else None)
# parser.add_argument('--workdir')  # only required to make ete trees
args = parser.parse_args()

if args.n_max_queries is not None:
    print '    --n-max-queries set to %d' % args.n_max_queries
glfo, true_lines, _ = utils.read_output(args.infname,
                                        n_max_queries=args.n_max_queries)

# numpy.random.seed(1)
if args.max_family_size is not None:
    for line in [
            l for l in true_lines
            if len(l['unique_ids']) > args.max_family_size
    ]:
        iseqs_to_keep = numpy.random.choice(range(len(line['unique_ids'])),
                                            args.max_family_size)
        utils.restrict_to_iseqs(line, iseqs_to_keep, glfo)

if args.metric_method == 'dtr':
    treeutils.calculate_tree_metrics(
        None,
        args.lb_tau,
Example #19
                               reverse=True)
    total_prob = 0.
    for naive_seq, prob in nseq_info:
        print '  %s   %5.2f  %s' % (utils.color_mutants(
            naive_seq if ref_seq is None else ref_seq,
            naive_seq), prob, utils.color(namecolor, namestr))
        if ref_seq is None:
            ref_seq = naive_seq
        if 1. - total_prob < args.prob_to_ignore:
            break
        total_prob += prob
    return ref_seq


glfo, annotation_list, cpath = utils.read_output(
    '%s/%s/partition-with-alternative-annotations.yaml' %
    (args.basedir, args.locus))
lh_info = read_linearham_output()

# print annotations for the biggest cluster in the most likely partition
annotations = {
    ':'.join(adict['unique_ids']): adict
    for adict in annotation_list
}  # collect the annotations in a dictionary so they're easier to access
most_likely_partition = cpath.partitions[cpath.i_best]  # a partition is represented as a list of lists of strings, with each string a sequence id
sorted_clusters = sorted(most_likely_partition, key=len, reverse=True)
for cluster in sorted_clusters:
    line = annotations[':'.join(cluster)]
    print ':'.join(line['unique_ids'])
Example #20
def run_train(args, hypers):
	system_check_and_init(args)
	if args.gpu:
		print "GPU available"
	else:
		print "CPU only"

	word_v = vocabulary()
	char_v = vocabulary()
	actn_v = vocabulary()
	pretrain = PretrainedEmb(args.pretrain_path)

	#instances
	train_input = read_input(args.train_input)
	dev_input = read_input(args.dev_input)

	singleton_idx_dict, word_dict, word_v = get_singleton_dict(train_input, word_v)
	extra_vl = [vocabulary() for i in range(len(train_input[0]) - 1)]
	train_instance, word_v, char_v, extra_vl = input2instance(train_input, word_v, char_v, pretrain, extra_vl, word_dict, args, "train")
	word_v.freeze()
	char_v.freeze()
	for i in range(len(extra_vl)):
		extra_vl[i].freeze()
	dev_instance, word_v, char_v, extra_vl = input2instance(dev_input, word_v, char_v, pretrain, extra_vl, {}, args, "dev")

	train_output = read_output(args.train_action)
	dev_output = read_output(args.dev_action)
	train_action, actn_v = output2action(train_output, actn_v)
	#dev_action, actn_v = output2action(dev_output, actn_v)

	print "word vocabulary size:", word_v.size()
	print "char vocabulary size:", char_v.size() - 1
	print "pretrain vocabulary size:", pretrain.size() - 1
	extra_vl_size = []
	for i in range(len(extra_vl)):
		print "extra", i, "vocabulary size:", extra_vl[i].size()
		extra_vl_size.append(extra_vl[i].size())
	print "action vocaluary size:", actn_v.size() - 1
	actn_v.freeze()
	actn_v.dump()

	# neural components
	input_representation = token_representation(word_v.size(), char_v.size(), pretrain, extra_vl_size, args)
	encoder = None
	if args.encoder == "BILSTM":
		encoder = bilstm_encoder(args)
	elif args.encoder == "Transformer":
		encoder = transformer(args)
	assert encoder, "please specify encoder type"
	
	decoder = in_order_constituent_parser(actn_v.size(), actn_v.toidx("TERM"), args)
	mask = in_order_constituent_parser_mask(actn_v)

	if args.gpu:
		encoder = encoder.cuda()
		decoder = decoder.cuda()
		input_representation = input_representation.cuda()
	
	#training process
	model_parameters = list(encoder.parameters()) + list(decoder.parameters()) + list(input_representation.parameters())
	#model_optimizer = optimizer(args, model_parameters)
	lr = args.learning_rate_f

	i = len(train_instance)  # start past the end so the first loop iteration rolls over into epoch 0
	check_iter = 0
	check_loss = 0
	bscore = -1
	epoch = -1
	while True:
		for p in model_parameters:
			if p.grad is not None:
				p.grad.detach_()
				p.grad.zero_()
		
		if i == len(train_instance):
			i = 0
			epoch += 1
			lr = args.learning_rate_f / (1 + epoch * args.learning_rate_decay_f)

		check_iter += 1
		input_t = input_representation(train_instance[i], singleton_idx_dict=singleton_idx_dict, test=False)
		enc_rep_t = encoder(input_t, test=False)
		loss_t = decoder(enc_rep_t, mask, train_action[i], test=False)
		check_loss += loss_t.data.tolist()

		if check_iter % args.check_per_update == 0:
			print('epoch %.6f : %.10f [lr: %.6f]' % (check_iter*1.0/len(train_instance), check_loss*1.0 / args.check_per_update, lr))
			check_loss = 0
		
		if check_iter % args.eval_per_update == 0:
			trees = []
			for j, instance in enumerate(dev_instance):
				dev_input_embeddings = input_representation(instance)
				dev_enc_rep = encoder(dev_input_embeddings)
				dev_action_output = decoder(dev_enc_rep, mask)
				#print dev_action_output
				#print dev_input[j][0][1:-1]
				#print dev_input[j][-1][1:-1]
				trees.append(in_order_constituent_action2tree(dev_action_output, actn_v, dev_input[j][0][1:-1], dev_input[j][-1][1:-1]))
			with open("tmp/dev.output.tmp", "w") as w:
				for tree in trees:
					w.write(tree+"\n")
				w.flush()
				w.close()
			score = constituent_parser_eval(args)
			print('dev F-score %.10f ' % (score))
			if score >= bscore:
				bscore = score
				torch.save({"encoder":encoder.state_dict(), "decoder":decoder.state_dict(), "input_representation": input_representation.state_dict()}, args.model_path_base+"/model")
		i += 1
		loss_t.backward()
		torch.nn.utils.clip_grad_value_(model_parameters, 5)
		#model_optimizer.step()
		for p in model_parameters:
			if p.requires_grad:
				p.data.add_(-lr, p.grad.data)
Example #21
from clusterpath import ClusterPath

parser = argparse.ArgumentParser()
parser.add_argument('--fname',
                    default=partis_dir +
                    '/test/reference-results/partition-ref-simu.yaml')
parser.add_argument('--glfo-dir', default=partis_dir + '/data/germlines/human')
parser.add_argument('--locus', default='igh')
args = parser.parse_args()

glfo = None
if utils.getsuffix(args.fname) == '.csv':
    print '  reading deprecated csv format, so need to read germline info from somewhere else, using --glfo-dir %s, hopefully it works' % args.glfo_dir
    glfo = glutils.read_glfo(args.glfo_dir, locus=args.locus)

glfo, annotation_list, cpath = utils.read_output(args.fname, glfo=glfo)

if cpath is None or len(cpath.partitions) == 0:
    print 'no partitions read from %s, so just printing first annotation:' % args.fname
    utils.print_reco_event(annotation_list[0])
    sys.exit(0)

print utils.color('green', 'list of partitions:')
cpath.print_partitions(abbreviate=True)  # 'abbreviate' prints little 'o's instead of the full sequence ids

# print annotations for the biggest cluster in the most likely partition
annotations = {
    ':'.join(adict['unique_ids']): adict
    for adict in annotation_list
Example #22
if 'extract-fasta.py' in sys.argv[0]:  # if they're trying to run this old script, which is now just a link to this one, print a warning and rejigger the arguments so it still works
    print '  note: running deprecated script %s, which currently is just a link pointing to %s' % (os.path.basename(sys.argv[0]), os.path.basename(os.path.realpath( __file__)))
    print '  note: transferring deprecated arguments --input-file and --fasta-output-file to the first two positional arguments (this will continue to work, you only need to change things if you want this warning to go away)'
    utils.insert_in_arglist(sys.argv, [utils.get_val_from_arglist(sys.argv, '--input-file'), utils.get_val_from_arglist(sys.argv, '--fasta-output-file')], sys.argv[0])
    utils.remove_from_arglist(sys.argv, '--input-file', has_arg=True)
    utils.remove_from_arglist(sys.argv, '--fasta-output-file', has_arg=True)

args = parser.parse_args()
args.extra_columns = utils.get_arg_list(args.extra_columns)
assert utils.getsuffix(args.outfile) in ['.csv', '.tsv', '.fa', '.fasta']

default_glfo_dir = partis_dir + '/data/germlines/human'
if utils.getsuffix(args.infile) == '.csv' and args.glfo_dir is None:
    print '  note: reading deprecated csv format, so need to get germline info from a separate directory; --glfo-dir was not set, so using default %s. If it doesn\'t crash, it\'s probably ok.' % default_glfo_dir
    args.glfo_dir = default_glfo_dir
glfo, annotation_list, cpath = utils.read_output(args.infile, glfo_dir=args.glfo_dir, locus=args.locus)

if args.plotdir is not None:
    from parametercounter import ParameterCounter
    setattr(args, 'region_end_exclusions', {r : [0 for e in ['5p', '3p']] for r in utils.regions})  # hackity hackity hackity
    pcounter = ParameterCounter(glfo, args)
    for line in annotation_list:
        pcounter.increment(line)
    pcounter.plot(args.plotdir) #, make_per_base_plots=True) #, only_overall=True, make_per_base_plots=True
    sys.exit(0)

if cpath is None or cpath.i_best is None:
    clusters_to_use = [l['unique_ids'] for l in annotation_list]
    print '  no cluster path in input file, so just using all %d sequences (in %d clusters) in annotations' % (sum(len(c) for c in clusters_to_use), len(clusters_to_use))
else:
    ipartition = cpath.i_best if args.partition_index is None else args.partition_index
Example #23
parser.add_argument('--locus', default='igh')
args = parser.parse_args()
if args.title == 'good':
    args.title = 'none'
elif args.title == 'chimeras':
    args.title = 'all chimeras'


def gk(uids):
    return ':'.join(uids)


glfo = None
if utils.getsuffix(args.infile) == '.csv':
    glfo = glutils.read_glfo(args.glfo_dir, args.locus)
glfo, annotation_list, _ = utils.read_output(args.infile, glfo=glfo)
annotations = collections.OrderedDict(
    (line['unique_ids'][0], line) for line in annotation_list)

chfo = {
    uid: {
        k: v
        for k, v in zip(
            ('imax', 'max_abs_diff'),
            utils.get_chimera_max_abs_diff(
                annotations[uid], iseq=0, chunk_len=args.chunk_len))
    }
    for uid in annotations
}
biggest_adiffs = sorted(chfo,
                        key=lambda q: chfo[q]['max_abs_diff'],
Example #24
    def _split_tweets(self):
        tweet_days = {}
        for member in self._community_members:
            tweets = member['tweets']
            for tweet in tweets:
                raw_timestamp = tweet['created_at']
                formatted_timestamp = \
                  parse_twitter_timestamp(raw_timestamp)
                if formatted_timestamp in tweet_days:
                    tweet_days[formatted_timestamp].append(tweet)
                else:
                    tweet_days[formatted_timestamp] = [tweet]
        return tweet_days

    def words_by_day(self):
        word_counts = {}
        for day, tweets in self._tweets_by_day.items():
            tweet_text = ''
            for tweet in tweets:
                tweet_text += tweet['text']
            day_counts = WordCounter(tweet_text).get_word_data()
            word_counts[day] = day_counts
        return word_counts

if __name__ == "__main__":
    community = read_output('pickled_populations/lizardbill_11_20_2010')
    dc = DayCounts(community)
    wc = dc.words_by_day()
    import pprint
    pprint.pprint(wc)
Example #25
    type=int,
    help=
    'take only the first N seqs from both the fasta file and the annotation in the partis output file (e.g. for testing when the family is huge)'
)
args = parser.parse_args()

new_seqfos = utils.read_fastx(args.new_seq_file, sanitize_seqs=True)
print '    read %d seqs from %s' % (len(new_seqfos), args.new_seq_file)

glfo = None
if utils.getsuffix(args.partis_output_file) == '.csv':
    print '    reading deprecated csv format, so need to read germline info from somewhere else, using --glfo-dir %s, hopefully it works' % args.glfo_dir
    glfo = glutils.read_glfo(args.glfo_dir, locus=args.locus)

glfo, annotation_list, cpath = utils.read_output(args.partis_output_file,
                                                 glfo=glfo,
                                                 locus=args.locus)
if args.partition_index is not None:
    print '  using non-best partition index %d (best is %d)' % (
        args.partition_index, cpath.i_best)
partition = cpath.partitions[cpath.i_best if args.partition_index is None else args.partition_index]
print '    read partition with %d clusters from %s' % (len(partition),
                                                       args.partis_output_file)

new_uids = set(sfo['name'] for sfo in new_seqfos)
clusters_with_overlap = []
for cluster in partition:
    overlap_uids = set(cluster) & new_uids
    if len(overlap_uids) > 0:
        clusters_with_overlap.append((cluster, overlap_uids))
Example #26
parser.add_argument('--seed-unique-id', help='if set, take sequences only from the cluster containing this seed sequence, rather than the default of taking all sequences from all clusters')
parser.add_argument('--cluster-index', type=int, help='if set, take sequences only from the cluster at this index in the partition, rather than the default of taking all sequences from all clusters')
parser.add_argument('--indel-reversed-seqs', action='store_true', help='if set, take sequences that have had any shm indels "reversed" (i.e. insertions are reversed, and deletions are replaced with the germline bases) rather than the default of using sequences from the original input file. Indel-reversed sequences can be convenient because they are by definition the same length as and aligned to the naive sequence.')
parser.add_argument('--glfo-dir', help='Directory with germline info. Only necessary for old-style csv output files. Equivalent to a parameter dir with \'/hmm/germline-sets\' appended.')
parser.add_argument('--locus', default='igh', help='only used for old-style csv output files')
args = parser.parse_args()

glfo = None
if utils.getsuffix(args.input_file) == '.csv':
    default_glfo_dir = partis_dir + '/data/germlines/human'
    if args.glfo_dir is None:
        print '  note: reading deprecated csv format, so need to get germline info from a separate directory; --glfo-dir was not set, so using default %s. If it doesn\'t crash, it\'s probably ok.' % default_glfo_dir
        args.glfo_dir = default_glfo_dir
    glfo = glutils.read_glfo(args.glfo_dir, locus=args.locus)

glfo, annotation_list, cpath = utils.read_output(args.input_file, glfo=glfo)

if cpath is None:
    clusters_to_use = [l['unique_ids'] for l in annotation_list]
    print '  no cluster path in input file, so just using all %d sequences (in %d clusters) in annotations' % (sum(len(c) for c in clusters_to_use), len(clusters_to_use))
else:
    ipartition = cpath.i_best if args.partition_index is None else args.partition_index
    print '  found %d clusters in %s' % (len(cpath.partitions[ipartition]), 'best partition' if args.partition_index is None else 'partition at index %d (of %d)' % (ipartition, len(cpath.partitions)))
    if args.cluster_index is None:
        clusters_to_use = cpath.partitions[ipartition]
        print '    taking all %d clusters' % len(clusters_to_use)
    else:
        clusters_to_use = [cpath.partitions[ipartition][args.cluster_index]]
        print '    taking cluster at index %d' % args.cluster_index
    if args.seed_unique_id is not None:
        clusters_to_use = [c for c in clusters_to_use if args.seed_unique_id in c]  # NOTE can result in more than one cluster with the seed sequence (e.g. if this file contains intermediate annotations from seed partitioning))