def language_summarizer(resource, children, keep_details=False):
    """
    Compute, store and return the programming language summary of `resource`.

    The summary is a list of mappings such as
    {value: "programming_language", count: "count of occurrences"}
    sorted by decreasing count. It combines the language of `resource` itself
    with the already computed summaries of its direct `children`.

    `keep_details` is passed through as the `as_attribute` argument when
    reading the children summaries and when storing the new summary.
    """
    attribute = 'programming_language'

    own_language = getattr(resource, attribute, [])
    if own_language:
        collected = [own_language]
    elif resource.is_file:
        # a file without a detected language is still counted, under None
        collected = [None]
    else:
        collected = []

    # expand each child summary entry back to `count` repeated values
    for child in children:
        entries = get_resource_summary(
            child, key=attribute, as_attribute=keep_details)
        for entry in entries or []:
            collected.extend([entry['value']] * entry['count'])

    # summarize proper
    summarized = sorted_counter(summarize_languages(collected))
    set_resource_summary(
        resource, key=attribute, value=summarized, as_attribute=keep_details)
    return summarized
def license_summarizer(resource, children, keep_details=False):
    """
    Compute, store and return the license expressions summary of `resource`.

    The summary is a list of mappings such as
    {value: "expression", count: "count of occurrences"}
    sorted by decreasing count. It combines the license expressions of
    `resource` itself with the already computed summaries of its direct
    `children`.

    `keep_details` is passed through as the `as_attribute` argument when
    reading the children summaries and when storing the new summary.
    """
    attribute = 'license_expressions'

    # start from the data of the resource itself
    own_expressions = getattr(resource, attribute, [])
    if not own_expressions and resource.is_file:
        # a file without any detected expression is still counted, under None
        collected = [None]
    else:
        collected = list(own_expressions)

    # expand each child summary entry back to `count` repeated values
    # TODO: review this: this feels rather weird
    for child in children:
        entries = get_resource_summary(
            child, key=attribute, as_attribute=keep_details)
        for entry in entries or []:
            collected.extend([entry['value']] * entry['count'])

    # summarize proper
    summarized = sorted_counter(summarize_licenses(collected))
    set_resource_summary(
        resource, key=attribute, value=summarized, as_attribute=keep_details)
    return summarized
def build_summary(resource, children, attribute, summarizer, keep_details=False):
    """
    Update the `resource` Resource with a summary of itself and its `children`
    Resources for the `attribute` key (such as copyrights, etc.) and return
    this summary.

    - `attribute` is the name of the attribute ('copyrights', 'holders' etc.)
    - `summarizer` is a function that takes a list of texts and returns
      summarized texts with counts. Its return value must support
      `update()` with a mapping.
    - `keep_details` is passed through as the `as_attribute` argument when
      reading the children summaries and when storing the new summary.
    """
    own_entries = getattr(resource, attribute, [])

    # the data of the resource itself is kept as plain strings
    texts = [entry.get('value') for entry in own_entries] if own_entries else []
    # a file without any detection of its own counts as one "no detection"
    undetected = 1 if (not own_entries and resource.is_file) else 0

    # fold in the existing summaries of the direct children
    for child in children:
        entries = get_resource_summary(
            child, key=attribute, as_attribute=keep_details)
        for entry in entries or []:
            count = entry['count']
            value = entry['value']
            if not value:
                # an empty value carries a count of things without detection
                undetected += count
                continue
            texts.append(Text(value, value, count))

    # summarize proper using the provided function
    summarized = summarizer(texts)

    # add back the counter of things without detection
    if undetected:
        summarized.update({None: undetected})

    summarized = sorted_counter(summarized)
    if TRACE:
        logger_debug('COPYRIGHT summarized:', summarized)

    set_resource_summary(
        resource, key=attribute, value=summarized, as_attribute=keep_details)
    return summarized
def summarize_codebase_key_files(codebase, **kwargs):
    """
    Summarize the key files of `codebase` and store the result as an ordered
    mapping of {attribute: summary} in
    `codebase.attributes.summary_of_key_files`.

    Key files are top-level files that are flagged as a README, legal or
    manifest file. `kwargs` is accepted and not used here.
    """
    summarizable_attributes = codebase.attributes.summary.keys()
    if TRACE:
        logger_debug('summarizable_attributes:', summarizable_attributes)

    # TODO: we cannot summarize packages with "key files" for now
    supported_attributes = {
        'license_expressions',
        'copyrights',
        'holders',
        'authors',
        'programming_language',
        # 'packages',
    }

    # one list of collected values for each supported summarized attribute
    values_by_key = OrderedDict(
        (key, []) for key in summarizable_attributes
        if key in supported_attributes
    )

    def is_key_file(res):
        return (
            res.is_file
            and res.is_top_level
            and (res.is_readme or res.is_legal or res.is_manifest)
        )

    for resource in codebase.walk(topdown=True):
        if not is_key_file(resource):
            continue
        for key, values in values_by_key.items():
            # note we assume things are stored as extra-data, not as direct
            # Resource attributes
            res_summaries = get_resource_summary(
                resource, key=key, as_attribute=False)
            for summary in res_summaries or []:
                # each summary is a value/count mapping that we expand back
                # to `count` repeated values
                values.extend([summary['value']] * summary['count'])

    # first summarize every attribute, then sort each resulting counter
    summary_counters = [
        (key, summarize_values(values, key))
        for key, values in values_by_key.items()
    ]
    sorted_summaries = OrderedDict(
        (key, sorted_counter(counter)) for key, counter in summary_counters
    )

    codebase.attributes.summary_of_key_files = sorted_summaries

    if TRACE:
        logger_debug('codebase summary_of_key_files:', sorted_summaries)
def summarize_codebase_by_facet(codebase, **kwargs):
    """
    Summarize `codebase` by facet and extend
    `codebase.attributes.summary_by_facet` with one
    {facet: ..., summary: {attribute: summary}} mapping for each known facet.

    Only files are considered, and empty summary values are skipped.
    `kwargs` is accepted and not used here.
    """
    from summarycode import facet as facet_module

    summarizable = codebase.attributes.summary.keys()
    if TRACE:
        logger_debug('summarize_codebase_by_facet for attributes:', summarizable)

    # one group of per-attribute lists of values for each known facet
    values_by_key_by_facet = {
        facet: {key: [] for key in summarizable}
        for facet in facet_module.FACETS
    }

    for resource in codebase.walk(topdown=True):
        if not resource.is_file:
            continue

        for facet in resource.facets:
            # note: this will fail loudly if the facet is not a known one
            values_by_key = values_by_key_by_facet[facet]
            for key, values in values_by_key.items():
                # note we assume things are stored as extra-data, not as
                # direct Resource attributes
                res_summaries = get_resource_summary(
                    resource, key=key, as_attribute=False)
                for summary in res_summaries or []:
                    # each summary is a value/count mapping that we expand
                    # back to discrete values
                    sum_value = summary.get('value')
                    if not sum_value:
                        continue
                    values.extend([sum_value] * summary['count'])

    final_summaries = []
    for facet, values_by_key in values_by_key_by_facet.items():
        sorted_summaries = {
            key: sorted_counter(summarize_values(values, key))
            for key, values in values_by_key.items()
        }
        final_summaries.append({'facet': facet, 'summary': sorted_summaries})

    codebase.attributes.summary_by_facet.extend(final_summaries)

    if TRACE:
        logger_debug('codebase summary_by_facet:', final_summaries)
def tally_codebase_key_files(codebase, field='tallies', **kwargs):
    """
    Tally the key files of `codebase` and store the result as a mapping of
    {attribute: tallies} in `codebase.attributes.tallies_of_key_files`.

    Key files are top-level files that are flagged as a README, legal or
    manifest file. Only the attributes of `codebase.attributes.tallies` that
    are also listed in TALLYABLE_ATTRS are tallied, and empty tally values
    are skipped.

    NOTE: `field` and `kwargs` are accepted and not used here: the attributes
    are always read from `codebase.attributes.tallies`.
    """
    talliables = codebase.attributes.tallies.keys()
    if TRACE:
        logger_debug('tallieables:', talliables)

    # TODO: we cannot summarize packages with "key files" for now
    # one list of collected values for each tallyable attribute
    talliable_values_by_key = {
        key: [] for key in talliables if key in TALLYABLE_ATTRS
    }

    # filter to get only key files
    key_files = (
        res for res in codebase.walk(topdown=True)
        if res.is_file
        and res.is_top_level
        and (res.is_readme or res.is_legal or res.is_manifest)
    )

    for resource in key_files:
        for key, values in talliable_values_by_key.items():
            # note we assume things are stored as extra-data, not as direct
            # Resource attributes
            res_tallies = get_resource_tallies(
                resource, key=key, as_attribute=False) or []
            for tally in res_tallies:
                # each tally is a value/count mapping that we expand back to
                # discrete values
                tally_value = tally.get('value')
                if tally_value:
                    values.extend([tally_value] * tally['count'])

    # Every key was already checked against TALLYABLE_ATTRS when building the
    # mapping above, so no further filtering is needed here.
    tally_counters = [
        (key, tally_values(values, key))
        for key, values in talliable_values_by_key.items()
    ]
    sorted_tallies = {
        key: sorted_counter(counter) for key, counter in tally_counters
    }

    codebase.attributes.tallies_of_key_files = sorted_tallies

    if TRACE:
        logger_debug('codebase tallies_of_key_files:', sorted_tallies)