from datetime import datetime


def all_revisions(revisions):
    # TODO: stats by editor (top %, 5+ edits), by date (last 30 days), length stats
    if revisions:
        first_edit_age = datetime.utcnow() - revisions[0]['rev_parsed_date']
        latest_age = datetime.utcnow() - revisions[-1]['rev_parsed_date']
        ret = {
            'all': set_info(revisions),
            '2_days': set_info(newer_than(2, revisions)),
            '30_days': set_info(newer_than(30, revisions)),
            '90_days': set_info(newer_than(90, revisions)),
            '365_days': set_info(newer_than(365, revisions)),
            'latest_date': revisions[-1]['rev_parsed_date'].isoformat(),
            'latest_age': latest_age.total_seconds(),
            'first_date': revisions[0]['rev_parsed_date'].isoformat(),
            'first_age': first_edit_age.total_seconds(),
            'interval': dist_stats(get_time_diffs(revisions)),
        }
    else:
        # no revision history available: emit the same keys with empty values
        ret = dict.fromkeys(['all', '2_days', '30_days', '90_days', '365_days',
                             'latest_date', 'latest_age', 'first_date',
                             'first_age', 'interval'], '')
    return ret
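# newer_than() is called above but not defined in this section. A minimal
# sketch of its assumed behavior -- keep revisions from the last `days` days,
# given the naive-UTC rev_parsed_date values used in all_revisions(). The
# project's real implementation may differ:
from datetime import timedelta


def newer_than(days, revisions):
    cutoff = datetime.utcnow() - timedelta(days=days)
    return [rev for rev in revisions if rev['rev_parsed_date'] > cutoff]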
def section_stats(headers):
    hs = [h for h in headers if get_text(h) != 'Contents']
    # how not to write Python: ['h' + str(i) for i in range(1, 8)]
    all_headers = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7']
    totals = []  # (section title, word count) pairs
    for header in hs:
        if header.getnext() is not None:
            # TODO: the next item after an h1 is the #bodyContents div
            pos = header.getnext()
            text = ''
            while pos.tag not in all_headers:
                text += ' ' + get_text(pos)
                # TODO: the references section may skew the no. of words under an h2
                if pos.getnext() is not None:
                    pos = pos.getnext()
                else:
                    break
            totals.append((get_text(header).replace('[edit] ', ''),
                           len(text.split())))
    dists = {}
    dists['header'] = dist_stats([len(header.split()) for header, t in totals])
    dists['text'] = dist_stats([word_count for h, word_count in totals])
    return dists
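# get_text() is used throughout but not defined in this section. Since the
# headers are lxml elements (.getnext(), .tag), it is presumably a thin
# wrapper along these lines (a sketch, not the actual implementation):
def get_text(elem):
    # flatten an element's text content, collapsing runs of whitespace
    return ' '.join(elem.text_content().split())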
def set_info(revisions):
    editor_counts = get_editor_counts(revisions)
    sorted_editor_counts = sorted(editor_counts.iteritems(),
                                  key=lambda (k, v): v, reverse=True)
    sorted_editor_bytes = sorted(get_editor_bytes(revisions).iteritems(),
                                 key=lambda (k, v): v, reverse=True)
    abs_byte_sum = sum([abs(x['rev_diff']) for x in revisions])
    return {
        'count': len(revisions),
        'minor_count': int(sum([rev['rev_minor_edit'] for rev in revisions])),
        'byte_count': sum([rev['rev_diff'] for rev in revisions]),
        'by_day': dist_stats(edits_by_day(revisions)),
        # rev_user == 0 marks anonymous (IP) edits
        'ip_edit_count': len([rev for rev in revisions if rev['rev_user'] == 0]),
        # crude heuristic: count edit summaries that mention "revert"
        'est_revert_count': len([rev for rev in revisions
                                 if 'revert' in rev['rev_comment'].lower()]),
        'blank_count': len([x for x in revisions if x['rev_len'] == 0]),
        'deleted_count': len([x for x in revisions if x['rev_deleted'] > 0]),
        'abs_byte': dist_stats([abs(rev['rev_diff'])
                                for rev in revisions]) if revisions else {},
        # RETURNING_ED_THRESHOLD is a module-level constant defined elsewhere
        'ed_returning': len([c for c in editor_counts.itervalues()
                             if c > RETURNING_ED_THRESHOLD]),
        'ed_unique': len(editor_counts),
        'ed_top_20': get_top_percent_editors(.20, sorted_editor_counts,
                                             len(revisions)),
        'ed_top_5': get_top_percent_editors(.05, sorted_editor_counts,
                                            len(revisions)),
        'ed_top_20_bytes': get_top_percent_editors(.20, sorted_editor_bytes,
                                                   abs_byte_sum),
        'ed_top_5_bytes': get_top_percent_editors(.05, sorted_editor_bytes,
                                                  abs_byte_sum),
    }
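# get_top_percent_editors() is not defined in this section. Judging from the
# call sites above (a fraction, sorted (editor, amount) pairs, and a total),
# it presumably reports the share of activity contributed by the top slice of
# editors -- a sketch under that assumption, not the project's actual code:
def get_top_percent_editors(percent, sorted_editor_amounts, total):
    if not sorted_editor_amounts or not total:
        return 0.0
    top_n = max(1, int(round(percent * len(sorted_editor_amounts))))
    top_sum = sum([amount for editor, amount in sorted_editor_amounts[:top_n]])
    return top_sum / float(total)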
dom_data = cast_table(in_data, attr_selector='d_')
dom_stats = Orange.statistics.basic.Domain(dom_data)

# build a z-score-normalized copy of every continuous feature
new_attrs = []
for attr in dom_data.domain.features:
    attr_c = Orange.feature.Continuous(attr.name + "_n")
    attr_c.getValueFrom = Orange.classification.ClassifierFromVar(whichVar=attr)
    transformer = Orange.data.utils.NormalizeContinuous()
    attr_c.getValueFrom.transformer = transformer
    transformer.average = dom_stats[attr].avg
    transformer.span = dom_stats[attr].dev
    new_attrs.append(attr_c)
new_domain = Orange.data.Domain(new_attrs, dom_data.domain.classVar)
norm_dom_data = Orange.data.Table(new_domain, dom_data)

fa_res = fa_node.execute(norm_dom_data.to_numpy()[0])
out_data = Table(fa_node.A)

from stats import dist_stats

in_domain = norm_dom_data.domain
LATENT_COUNT = min(len(in_domain.attributes) / 2, len(fa_node.A))
latent_attrs = []
weights = fa_node.A.transpose()
for i in range(LATENT_COUNT):
    cur_weights = weights[i]
    abs_stats = dist_stats([abs(x) for x in cur_weights])
    # cutoff: keep features whose |loading| exceeds the mean by one std dev
    mean_abs = abs_stats['mean']
    dev_cutoff = abs_stats['std_dev']
    latent_attrs.append([(in_domain[j], x) for j, x in enumerate(cur_weights)
                         if abs(x) > mean_abs + dev_cutoff])
# sorted([(a, b) for a, b in zip(in_domain.features, fa_node.sigma)],
#        key=lambda x: x[1], reverse=True)
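# fa_node is never constructed in this fragment. Its .execute()/.A/.sigma
# usage matches MDP's factor-analysis node, so the setup upstream presumably
# looked something like this (an assumption, not confirmed by this code):
#
#     import mdp
#     fa_node = mdp.nodes.FANode()  # .A holds the estimated mixing matrix
#     fa_node.train(norm_dom_data.to_numpy()[0])
#     fa_node.stop_training()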
def element_words_dist(elem):
    # curried helper: given an element, return a function that takes a
    # selector f (element -> iterable of elements) and returns distribution
    # stats over the word counts of the selected elements
    return lambda f: dist_stats([len(get_text(navbox).split())
                                 for navbox in f(elem)])
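# usage sketch (names are illustrative): word-count stats for every table
# under a parsed page root
#
#     table_stats = element_words_dist(root)(lambda e: e.findall('.//table'))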