x = i.split() taxon = x[1:] for t in taxon: j = taxon.index(t) Index[t] = j + 1 continue elif match('Coefficients:', i): continue i.strip() x = i.split() species = x[0] if species in sample.keys(): duplicates.append(species) else: sample[species] = {} if missing == 'y': mtax = '' for t in taxon: if x[Index[t]] == '/': sample[species][t] = mtax else: sample[species][t] = x[Index[t]] mtax = x[Index[t]] else: for t in taxon:
def phy_re_analysis(options, args):
    """Run the PhyRe analysis for one sample file or a batch of them.

    The original command-line form of this script was::

        python PhyRe.py [samplefile] [masterlistfile] s1 s2 [options]

    Parameters
    ----------
    options : mapping
        Optional settings; every key below must be present, and a falsy
        value selects the default.

        ``'m'``
            "Missing data" flag forwarded unchanged to
            ``process_population_file_line`` (default ``'n'``).
        ``'o'``
            Base name of the output files (default: ``samplefile`` up to
            its first ``'.'``).
        ``'p'``
            Number of permutations for confidence intervals
            (default ``1000``).
        ``'c'``
            Funnel data is computed only when this is ``'y'``
            (default ``'y'``).
        ``'b'``
            When ``'y'``, ``samplefile`` is read as a list of sample file
            names, one per line (default ``'n'``).
        ``'l'``
            When ``'y'``, the coefficients and path lengths read from the
            ``Coefficients:`` header of the population file are used;
            for any other value the ones computed by ``PathLength`` are
            used (default ``'n'``).
        ``'s'``
            When truthy, results are returned as strings instead of being
            written to disk.
    args : mapping
        Positional settings: ``'samplefile'``, ``'popfile'`` (the master
        list) and ``'d1'``/``'d2'`` (the range for the number of species
        for the funnel plot).

    Returns
    -------
    tuple or None
        ``(phy_re_result, funnel_data)`` when ``options['s']`` is truthy.
        Otherwise ``None``, and two files are written:

        * ``<out>.out`` -- results from the analyses of the sample(s);
        * ``<out up to its first '_'>_funnel.out`` -- results from random
          subsamples of the master list (empty when ``options['c']`` is
          not ``'y'``).

    Notes
    -----
    Described in "Phylogenetic representativeness: a new method for
    evaluating taxon sampling in evolutionary studies" [1]_.

    References
    ----------
    .. [1] http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-11-209
    .. [2] http://www.mozoolab.net/downloads/manual.pdf
    """
    samplefile = args['samplefile']
    popfile = args['popfile']
    d1 = args['d1']
    d2 = args['d2']

    # A falsy option value means "not supplied": fall back to the default.
    missing = options['m'] or 'n'
    out = options['o'] or samplefile.split('.')[0]
    p = options['p'] or 1000
    ci = options['c'] or 'y'
    batch = options['b'] or 'n'
    pathlengths = options['l'] or 'n'
    output_as_string = bool(options['s'])

    # Population - dictionary with population file information
    population = defaultdict()
    Index = {}
    Taxon = defaultdict()
    coef = {}
    taxon = []
    pathLengths = defaultdict()

    # The sample file is opened in both modes so that a missing file is
    # still reported up front; the context manager closes the handle.
    with open(samplefile) as fp:
        if batch == 'y':
            # Batch mode: every line names one sample file.
            Files = [line.strip() for line in fp]
        else:
            Files = [samplefile]

    with open(popfile) as fp:
        population_file_entries = fp.readlines()

    # The first two lines may carry header information ("Taxon:" and
    # "Coefficients:"). Consume whichever are present, then drop them
    # from the list of entries.
    header_line_count = 0
    for header in population_file_entries[:2]:
        if match('Taxon:', header):
            header_line_count += 1
            names = header.split()
            names.remove('Taxon:')
            for name in names:
                taxon.append(name)
                # Index maps a taxon name to its column in an entry line;
                # column 0 is the species name, hence the +1.
                Index[name] = names.index(name) + 1
        elif match('Coefficients:', header):
            header_line_count += 1
            tokens = header.split()
            tokens.remove('Coefficients:')
            # NOTE(security): eval() executes text taken from the
            # population file; only use master lists from trusted sources.
            # list() keeps the result sliceable on Python 3 as well.
            values = list(map(eval, tokens))
            for name in taxon:
                level = taxon.index(name)
                coef[name] = sum(values[level:])
                pathLengths[name] = values[level]
    population_file_entries = population_file_entries[header_line_count:]

    # Parse the remaining lines into species -> taxonomy, keeping the first
    # occurrence of every species and recording repeats.
    duplicates = []
    for entry in population_file_entries:
        species_name, species_taxonomy = process_population_file_line(
            entry, taxon, Index, missing)
        if species_name in population:
            duplicates.append(species_name)
        else:
            population[species_name] = species_taxonomy

    if duplicates:
        print("Population master list contains %s duplicates"
              % len(duplicates))

    # PathLength is always needed for popN. Its coefficients and path
    # lengths replace the header-supplied ones unless the caller explicitly
    # asked ('y') to keep those from the population file.
    computed_coef, popN, computed_path_lengths = PathLength(
        population, taxon, Taxon)
    if pathlengths != 'y':
        coef = computed_coef
        pathLengths = computed_path_lengths
    print("Finished path length calculation")

    # Analyse every sample file; results are keyed by the file name up to
    # its first '.'.
    results = {}
    for sample_path in Files:
        sample = get_sample_subset_from_sample_file(sample_path, population)
        key = sample_path.split('.')[0]
        results[key] = {}

        samp = sample.keys()
        atd, taxonN, Taxon = ATDmean(sample, samp, taxon, coef)
        average_taxonomic_distinctness_variance = \
            ATDvariance(taxonN, samp, atd, taxon, coef)
        euler_results = euler(sample, atd, taxonN, taxon, Taxon, coef)

        results[key]['atd'] = atd
        results[key]['vtd'] = average_taxonomic_distinctness_variance
        results[key]['euler'] = euler_results
        results[key]['N'] = taxonN
        results[key]['n'] = len(sample)
        results[key]['taxon'] = Taxon

    # taxonN here is the value left by the last sample processed.
    phy_re_result = printResults(taxon, taxonN, popN, pathLengths, results)

    funnel_data = ''
    if ci == 'y':
        funnel_data = print_funnel_data(p, d1, d2, population, taxon, coef)

    if output_as_string:
        return phy_re_result, funnel_data

    with open(out + '.out', 'w') as fp:
        fp.write(phy_re_result)
    with open(out.split('_')[0] + '_funnel.out', 'w') as fp:
        fp.write(funnel_data)