def save_annotation(data):
    user_id = data['user_id']
    annotations = data['annotations']

    sample_to_platform = dict(
        Sample.objects.filter(id__in=annotations).values_list('id', 'platform_id'))
    groups = group_by(lambda pair: sample_to_platform[pair[0]], annotations.items())

    for platform_id, annotations in groups.items():
        canonical = SeriesAnnotation.objects.create(
            platform_id=platform_id,
            annotations=1, authors=1,
            **project(data, ['series_id', 'tag_id', 'column', 'regex']))
        canonical.fill_samples(annotations)

        raw_annotation = RawSeriesAnnotation.objects.create(
            canonical=canonical,
            platform_id=platform_id,
            created_by_id=user_id,
            **project(data, ['series_id', 'tag_id', 'column', 'regex', 'note', 'from_api']))
        raw_annotation.fill_samples(annotations)

        ValidationJob.objects.create(annotation=canonical)

def extract_map(name_map, names):
    lookup_root = fn.merge(*({v: k for v in vals} for k, vals in name_map))
    mapping = fn.group_by(lambda x: lookup_root[x.split('##time_')[0]], names)
    mapping = fn.walk_values(tuple, mapping)  # Make hashable.
    return frozenset(mapping.items())

def _push(
    repo,
    git_remote: str,
    refs: Iterable["ExpRefInfo"],
    force: bool,
) -> Mapping[SyncStatus, List["ExpRefInfo"]]:
    from scmrepo.exceptions import AuthError

    from ...scm import GitAuthError

    refspec_list = [f"{exp_ref}:{exp_ref}" for exp_ref in refs]
    logger.debug(f"git push experiment '{refs}' -> '{git_remote}'")

    with TqdmGit(desc="Pushing git refs") as pbar:
        try:
            results: Mapping[str, SyncStatus] = repo.scm.push_refspecs(
                git_remote,
                refspec_list,
                force=force,
                progress=pbar.update_git,
            )
        except AuthError as exc:
            raise GitAuthError(str(exc))

    def group_result(refspec):
        return results[str(refspec)]

    pull_result: Mapping[SyncStatus, List["ExpRefInfo"]] = group_by(
        group_result, refs
    )

    return pull_result

def buildDict(prefix, urlx):
    rd = jutils.getHttpCsv(urlx)
    rdict = {}
    wlist = []
    for row in rd:
        cols = row.split(',')
        if len(cols) > 5:
            wlist.append(cols[5])
    w2list = [i for i in wlist if len(i) >= CHAR_MIN and len(i) <= CHAR_MAX]
    # print jj(w2list[:50])
    # pr = partListToDict(w2list,keyFuncFirstChar)
    pr = funcy.group_by(lambda x: x[0], w2list)
    c = Counter([i[0] for i in w2list]).most_common(MOST_LIMIT)
    # print jj(c[:24])
    # print jj(pr[c[0][0]])
    fmck = [i[0] for i in c]
    fmcv = [i[1] for i in c]
    # 256*8=2048
    # 512*4=2048
    wordlist = funcy.flatten([pr[x][:4] for x in fmck])
    rdict['data'] = funcy.select_keys(lambda x: x in fmck, pr)
    rdict['meta'] = {
        'source': urlx,
        'wordlist': wordlist,
        'firstMostCommonKey': fmck,
        'firstMostCommonCount': fmcv,
        'host': 'http://data.gov.tw',
        'build': 'http://console.tw',
        'script': 'https://github.com/y12studio/console-tw/tree/master/projects/datachart/',
        'prefix': prefix,
        'time': datetime.datetime.utcnow().isoformat()
    }
    return rdict

def vim_leave_pre():
    ''' Remove all pyunite states in each tabpage '''
    with restore(vim.current.window), restore(vim.current.tabpage):
        states_by_tab = fn.group_by(itemgetter('tabpage_from'), variables.states)
        # error(str(dict(states_by_tab)))
        for tabpage, states in states_by_tab.items():
            change_tabpage(tabpage)
            map(remove_state, states)

def _normalize_options(self, query, options):
    options = DatatableOptions._normalize_options(self, query, options)
    filters = group_by(r'^(GSE|GPL|)', options['search'].split())
    options['search'] = ''.join(filters.pop('', []))
    options['filters'] = filters
    return options

def group_by_attr(attr: str, it: Iterable) -> dict:
    """
    :param attr: attribute expected to be on any item in it
    :param it: any iterable
    :return: dict where each item x of it is placed in the bucket keyed by x.attr
    """
    return dict(group_by(operator.attrgetter(attr), it))

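# A minimal usage sketch (illustrative, not from the original source): funcy.group_by with
# operator.attrgetter buckets objects by an attribute value; wrapping the result in dict()
# drops the defaultdict behavior, so missing keys raise KeyError instead of returning [].
import operator
from collections import namedtuple

from funcy import group_by

Point = namedtuple('Point', ['color', 'x'])
points = [Point('red', 1), Point('blue', 2), Point('red', 3)]

by_color = dict(group_by(operator.attrgetter('color'), points))
assert by_color == {
    'red': [Point('red', 1), Point('red', 3)],
    'blue': [Point('blue', 2)],
}
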
def load_data(header):
    print(colored('Found %d data lines' % len(header), 'cyan'))
    line_groups = group_by(r'^!([^_]+)_', header)

    # Load series
    series_df = get_df_from_lines(line_groups['Series'], 'Series')
    assert len(series_df.index) == 1
    missing = REQUIRED_SERIES_FIELDS - set(series_df.columns)
    if missing:
        cprint('Skip incomplete header: %s column%s missing'
               % (', '.join(sorted(missing)), 's' if len(missing) > 1 else ''), 'red')
        return
    gse_name = series_df['series_geo_accession'][0]

    # Skip multispecies
    if '|\n|' in series_df['series_platform_taxid'][0]:
        cprint('Skip multispecies', 'red')
        return
    if series_df['series_platform_taxid'][0] != series_df['series_sample_taxid'][0]:
        cprint('Skip sample-platform species mismatch', 'red')
        return

    # Check if series updated
    try:
        old_last_update = Series.objects.get(gse_name=gse_name).attrs.get('last_update_date')
    except Series.DoesNotExist:
        old_last_update = None
    new_last_update = series_df['series_last_update_date'][0]
    if new_last_update == old_last_update:
        print(colored('%s not changed since %s' % (gse_name, old_last_update), 'yellow'))
        return
    else:
        print(colored('%s updated %s -> %s'
                      % (gse_name, old_last_update, new_last_update), 'green'))

    # Load samples
    try:
        samples_df = get_df_from_lines(line_groups['Sample'], 'Sample')
    except pd.errors.ParserError as e:
        cprint('Failed to parse sample lines: %s' % e, 'red')
        return
    samples_df['gsm_name'] = samples_df.sample_geo_accession
    samples_df = samples_df.set_index('gsm_name')

    insert_or_update_data(series_df, samples_df)

def iterable_per_line(triples):
    """Yield iterables of (key, value mapping), one for each line."""
    # Jam all the triples of a file into a hash by line number:
    line_map = group_by(lambda (k, v, extent): extent.start.row, triples)  # {line: triples}
    last_line = max(line_map.iterkeys()) + 1 if line_map else 1
    # Pull out the needles for each line, stripping off the extents and
    # producing a blank list for missing lines. (The defaultdict returned from
    # group_by takes care of the latter.)
    return [[(k, v) for (k, v, e) in line_map[line_num]]
            for line_num in xrange(1, last_line)]

def iter_revs(
    scm: "Git",
    head_revs: Optional[List[str]] = None,
    num: int = 1,
    all_branches: bool = False,
    all_tags: bool = False,
    all_commits: bool = False,
    all_experiments: bool = False,
) -> Mapping[str, List[str]]:
    from dvc.repo.experiments.utils import fix_exp_head

    if num < 1 and num != -1:
        raise InvalidArgumentError(f"Invalid number of commits '{num}'")

    if not any(
        [head_revs, all_branches, all_tags, all_commits, all_experiments]
    ):
        return {}

    head_revs = head_revs or []
    revs = []
    for rev in head_revs:
        revs.append(rev)
        n = 1
        while True:
            if num == n:
                break
            try:
                head = fix_exp_head(scm, f"{rev}~{n}")
                assert head
                revs.append(resolve_rev(scm, head))
            except RevError:
                break
            n += 1

    if all_commits:
        revs.extend(scm.list_all_commits())
    else:
        if all_branches:
            revs.extend(scm.list_branches())
        if all_tags:
            revs.extend(scm.list_tags())
        if all_experiments:
            from dvc.repo.experiments.utils import exp_commits

            revs.extend(exp_commits(scm))

    rev_resolver = partial(resolve_rev, scm)
    return group_by(rev_resolver, revs)

def sha1_copies(files):
    """
    Find files with identical sha1.

    Args:
        files(Iterable[PurePath]): Iterable of files.

    Yields:
        list: lists of files identical by sha1
    """
    file_hashes = group_by(get_file_sha1, files)
    for sha1, copy_files in file_hashes.items():
        if len(copy_files) > 1:
            yield copy_files

def latest_instances(self):
    """Group instances by their component and return the latest within each group."""
    component_lookup = {component.object_id: component for component in self.objects("COMPONENT")}
    grouped_instances = funcy.group_by(attrgetter("component_parent"), self.objects("INSTANCE"))
    logger.info(
        "Pipeline '%s' has %d components and %d instances, looking for latest instances",
        self.pipeline_id,
        len(grouped_instances),
        sum(map(len, grouped_instances.values())),
    )
    for component_parent in sorted(grouped_instances):
        latest_instance = sorted(grouped_instances[component_parent],
                                 key=attrgetter("scheduled_start_time"))[-1]
        latest_instance.parent_object = component_lookup[component_parent]
        yield latest_instance

def query_dnf(query):
    def table_for(alias):
        if alias == main_alias:
            return alias
        return query.alias_map[alias].table_name

    dnf = _dnf(query.where)

    # NOTE: we exclude content_type as it never changes and will hold dead invalidation info
    main_alias = query.model._meta.db_table
    aliases = {alias for alias, join in query.alias_map.items()
               if query.alias_refcount[alias] and table_tracked(join.table_name)} \
        | {main_alias} - {'django_content_type'}
    tables = group_by(table_for, aliases)
    return {table: clean_dnf(dnf, table_aliases) for table, table_aliases in tables.items()}

def query_dnf(query):
    def table_for(alias):
        if alias == main_alias:
            return alias
        return query.alias_map[alias].table_name

    dnf = _dnf(query.where)

    # NOTE: we exclude content_type as it never changes and will hold dead invalidation info
    main_alias = query.model._meta.db_table
    aliases = {alias for alias, (join, cnt) in zip_dicts(query.alias_map, query.alias_refcount)
               if cnt and family_has_profile(table_to_model(join.table_name))} \
        | {main_alias} - {'django_content_type'}
    tables = group_by(table_for, aliases)
    return {table: clean_dnf(dnf, table_aliases) for table, table_aliases in tables.items()}

def bounding_box(r: Rec, oracle):
    """Compute Bounding box. TODO: clean up"""
    recs = list(box_edges(r))

    tops = [(binsearch(r2, oracle)[1].top,
             tuple((np.array(r2.top) - np.array(r2.bot) != 0)))
            for r2 in recs]
    tops = fn.group_by(ig(1), tops)

    def _top_components():
        for key, vals in tops.items():
            idx = key.index(True)
            yield max(v[0][idx] for v in vals)

    top = np.array(list(_top_components()))
    intervals = tuple(zip(r.bot, top))
    return to_rec(intervals=intervals)

def group_needles(line_needles):
    """Group line needles by line, and return a list of needles for each
    line, up to the last line with any needles::

        [(a, 1), (b, 4), (c, 4)] -> [[a], [], [], [b, c]]

    """
    # Jam all the needles of a file into a hash by line number:
    line_map = group_by(itemgetter(1), line_needles)  # {line: needles}
    last_line = max(line_map.iterkeys()) + 1 if line_map else 1
    # Pull out the needles for each line, stripping off the line number
    # elements of the tuples and producing a blank list for missing lines.
    # (The defaultdict returned from group_by takes care of the latter.)
    return [[pair for (pair, _) in line_map[line_num]]
            for line_num in xrange(1, last_line)]

def iter_revs(
    scm: "Git",
    revs: Optional[List[str]] = None,
    num: int = 1,
    all_branches: bool = False,
    all_tags: bool = False,
    all_commits: bool = False,
    all_experiments: bool = False,
) -> Mapping[str, List[str]]:
    if not any([revs, all_branches, all_tags, all_commits, all_experiments]):
        return {}

    revs = revs or []
    results = []
    for rev in revs:
        if num == 0:
            continue
        results.append(rev)
        n = 1
        while True:
            if num == n:
                break
            try:
                head = f"{rev}~{n}"
                results.append(resolve_rev(scm, head))
            except RevError:
                break
            n += 1

    if all_commits:
        results.extend(scm.list_all_commits())
    else:
        if all_branches:
            results.extend(scm.list_branches())
        if all_tags:
            results.extend(scm.list_tags())
        if all_experiments:
            from dvc.repo.experiments.utils import exp_commits

            results.extend(exp_commits(scm))

    rev_resolver = partial(resolve_rev, scm)
    return group_by(rev_resolver, results)

def get_mbta_station_info(cfg):
    route_info = query_mbta_id("routes", cfg['route'])
    stop_info = query_mbta_id("stops", cfg['stop'])
    params = (('filter[stop]', cfg['stop']),
              ('filter[route]', cfg['route']),
              ('page[limit]', '10'))
    arrivals = query_mbta('predictions', params)
    by_direction = f.walk_values(
        vectorize(f.compose(relative_ts, op.itemgetter('arrival_time'))),
        f.group_by(op.itemgetter('direction_id'), arrivals))
    return [
        f.merge(
            {
                "station": stop_info['name'],
                "route": cfg['route'],
                "direction": route_info['direction_destinations'][k],
            },
            dict(zip(range(5), pad(v, 5))))
        for k, v in by_direction.items()
    ]

def _normalize_options(self, query, options):
    """
    Here we parse some search tokens differently to enable filtering:
        GSE\d+ and GPL\d+   filter by a specific series or platform
        tag=\w+             filters by tag
        valid               selects validated annotations
    """
    options = DatatableOptions._normalize_options(self, query, options)
    # Try normally named field
    if not options['search']:
        options['search'] = query.get('search', '').strip()

    filters = group_by(r'^(GSE|GPL|[Tt]ag=|valid|novalid)', options['search'].split())
    options['search'] = ' '.join(filters.pop(None, []))

    filters = walk_keys(str.lower, filters)
    filters['tag'] = lmap(r'^[Tt]ag=(.*)', filters.pop('tag=', []))

    options['filters'] = filters
    return options

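# Illustrative sketch (not from the original source) of the extended funcy semantics the
# method above relies on: a regex string passed to group_by acts as the key function, keying
# each token by the first captured group, while tokens that do not match land under None.
from funcy import group_by

tokens = 'GSE1234 tag=diabetes some free text valid'.split()
filters = group_by(r'^(GSE|GPL|[Tt]ag=|valid|novalid)', tokens)

assert filters['GSE'] == ['GSE1234']
assert filters['tag='] == ['tag=diabetes']
assert filters['valid'] == ['valid']
assert filters[None] == ['some', 'free', 'text']  # leftover tokens become the plain search
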
def load_data(header):
    print colored('Found %d data lines' % len(header), 'cyan')
    line_groups = group_by(r'^!([^_]+)_', header)

    # Load series
    series_df = get_df_from_lines(line_groups['Series'], 'Series')
    assert len(series_df.index) == 1
    gse_name = series_df['series_geo_accession'][0]

    # Skip multispecies
    if '|\n|' in series_df['series_platform_taxid'][0]:
        cprint('Skip multispecies', 'red')
        return
    if series_df['series_platform_taxid'][0] != series_df['series_sample_taxid'][0]:
        cprint('Skip sample-platform species mismatch', 'red')
        return

    # Check if series updated
    try:
        old_last_update = Series.objects.get(gse_name=gse_name).attrs.get('last_update_date')
    except Series.DoesNotExist:
        old_last_update = None
    new_last_update = series_df['series_last_update_date'][0]
    if new_last_update == old_last_update:
        print colored('%s not changed since %s' % (gse_name, old_last_update), 'yellow')
        return
    else:
        print colored('%s updated %s -> %s'
                      % (gse_name, old_last_update, new_last_update), 'green')

    # Load samples
    samples_df = get_df_from_lines(line_groups['Sample'], 'Sample')
    samples_df['gsm_name'] = samples_df.sample_geo_accession
    samples_df = samples_df.set_index('gsm_name')

    insert_or_update_data(series_df, samples_df)

def draw_graph_animation(graph):
    vertices_names = graph.new_vertex_property('string')
    graph.vertex_properties['vertices_names'] = vertices_names

    for vertex in graph.vertices():
        vertices_names[vertex] = \
            graph.vertex_properties['actors_on_vertices'][vertex] + \
            ' ' + str(graph.vertex_properties['pagerank'][vertex])

    graph.vertex_properties['pos'] = sfdp_layout(
        graph, eweight=graph.edge_properties['weights_on_edges'])

    dir_name = 'pagerank/' + \
        graph.graph_properties['repo_on_graph'].replace('/', '%') + '/'
    os.mkdir(dir_name)

    def event_bulk(vertex):
        event = graph.vertex_properties['events_on_vertices'][vertex]
        return event['created_at'].strftime("%Y-%m-%d %H")

    batch_sizes = map(
        lambda x: len(x[1]),
        sorted(group_by(event_bulk, graph.vertices()).items(),
               key=lambda x: x[0]))

    def tail_number(n):
        if n == 0:
            return batch_sizes[0]
        else:
            return tail_number(n - 1) + batch_sizes[n]

    batch_numbers = map(tail_number, range(len(batch_sizes)))

    map(draw_graph_frame, map(lambda x: (graph, dir_name, x), batch_numbers))

    images = [Image.open(dir_name + str(i) + '.png') for i in batch_numbers]
    writeGif(dir_name + 'animation.gif', images, duration=0.1)

def handle(self, **options):
    SeriesAnnotation.objects.filter(annotations__gt=0).update(is_active=True)

    qs = RawSeriesAnnotation.objects.order_by('id')
    by_canonical = group_by(lambda a: a.canonical_id, qs)

    for anno in tqdm(qs):
        if anno.ignored or anno.by_incompetent:
            continue
        last_anno = by_canonical[anno.canonical_id][-1]
        if not samples_match(anno, last_anno):
            anno.is_active = False
            anno.obsolete = True
            anno.save()
        else:
            anno.obsolete = False
            try:
                anno.is_active = True
                anno.save()
            except IntegrityError:
                anno.is_active = False
                anno.note += '# dup'
                anno.save()

def find_copies(path_to_dir, delete):
    dir_path = Path(path_to_dir)
    dir_iter = recursion_finder(dir_path)

    # group all files by size and filter out 0-sized files
    file_sizes = group_by(get_file_size, dir_iter)
    file_sizes = select_keys(None, file_sizes)

    # get groups of files and filter out one-member groups
    files_groups = filter(not_alone_item, file_sizes.values())

    # view copies grouped by sha1
    for copies in sha1_copies_from_groups(files_groups):
        show_list_files(copies, TEXTS['identical_files'])

        if not delete:
            continue

        # wait for user input
        nums = get_nums_for_delete(copies)
        # the user chose not to delete any files
        if 0 in nums:
            continue

        files_to_delete = [copies[num - 1] for num in nums]
        # show files to delete
        show_list_files(files_to_delete, TEXTS['delete_list'])

        if click.confirm(TEXTS['confirm']):
            delete_files(files_to_delete)
            click.echo(TEXTS['delete_success'])
        else:
            click.echo(TEXTS['delete_aborted'])
        click.echo('=' * 20)

def _pull(
    repo,
    git_remote: str,
    refs: Iterable["ExpRefInfo"],
    force: bool,
) -> Mapping[SyncStatus, List["ExpRefInfo"]]:
    refspec_list = [f"{exp_ref}:{exp_ref}" for exp_ref in refs]
    logger.debug(f"git pull experiment '{git_remote}' -> '{refspec_list}'")

    with TqdmGit(desc="Fetching git refs") as pbar:
        results: Mapping[str, SyncStatus] = repo.scm.fetch_refspecs(
            git_remote,
            refspec_list,
            force=force,
            progress=pbar.update_git,
        )

    def group_result(refspec):
        return results[str(refspec)]

    pull_result: Mapping[SyncStatus, List["ExpRefInfo"]] = group_by(group_result, refs)

    return pull_result

def categorize(ast):
    """Group ast nodes based on their type."""
    return group_by(_categorize, ast.walk_down())

def calc_validation_stats(serie_validation_pk, recalc=False):
    serie_validation = SerieValidation.objects.select_for_update().get(pk=serie_validation_pk)
    # Guard from double update, so that user stats won't be messed up
    if not recalc and serie_validation.samples_total is not None:
        return

    series_tag = serie_validation.series_tag
    if not series_tag:
        return

    # Compare to annotation
    sample_validations = serie_validation.sample_validations.all()
    sample_annotations = series_tag.sample_tags.all()

    if set(r.sample_id for r in sample_validations) \
            != set(r.sample_id for r in sample_annotations):
        logger.error("Sample sets mismatch for validation %d" % serie_validation_pk)
        # It's either bug when making annotation or samples set really changed
        series_tag.is_active = False
        series_tag.save()
        # TODO: notify annotation author to redo it
        return

    _fill_concordancy(sample_validations, sample_annotations)

    # Fill serie validation stats
    serie_validation.samples_total = len(sample_validations)
    serie_validation.samples_concordant = sum(s.concordant for s in sample_validations)
    serie_validation.annotation_kappa = _cohens_kappa(sample_validations, sample_annotations)

    # Compare to other validations
    earlier_validations = series_tag.validations.filter(pk__lt=serie_validation_pk, ignored=False) \
                                    .order_by('pk')
    # TODO: use .prefetch_related()
    earlier_sample_validations = group_by(
        lambda v: v.serie_validation_id,
        SampleValidation.objects.filter(serie_validation__in=earlier_validations))

    if not serie_validation.concordant:
        serie_validation.agrees_with = first(
            v for v in earlier_validations
            if v.created_by_id != serie_validation.created_by_id
            and is_samples_concordant(earlier_sample_validations[v.pk], sample_validations))

    # NOTE: this includes kappas against your prev validations
    serie_validation.best_kappa = max(chain(
        [serie_validation.annotation_kappa],
        (_cohens_kappa(sample_validations, sv) for sv in earlier_sample_validations.values())))
    serie_validation.save()

    # Calculate fleiss kappa for all existing annotations/validations
    annotation_sets = [sample_annotations, sample_validations] \
        + earlier_sample_validations.values()
    series_tag.fleiss_kappa = _fleiss_kappa(annotation_sets)
    if not serie_validation.on_demand and not serie_validation.ignored \
            and (serie_validation.concordant or serie_validation.agrees_with):
        series_tag.agreed = earlier_validations.count() + 1
    series_tag.save()

    # TODO: make this separate task ?
    if not recalc and not serie_validation.on_demand and not serie_validation.by_incompetent:
        _update_user_stats(serie_validation)  # including payment ones

    # TODO: make this separate task ?
    # Reschedule validation if no agreement found
    if not series_tag.agreed and not recalc and not serie_validation.on_demand \
            and not serie_validation.by_incompetent:
        # Schedule revalidations with priority < 0, that's what new validations have,
        # to phase out garbage earlier
        _reschedule_validation(serie_validation, priority=series_tag.fleiss_kappa - 1)

def group(events):
    return group_by(lambda x: x['repo'], events)

def iuniq(func, lst):
    return imap(fn.first, fn.group_by(func, lst).itervalues())

def modulo_group(n: int, seq: Iterable) -> List[List[Any]]:
    grouped = group_by(lambda l: l[0] % n, enumerate(seq))
    return lmap(lambda l: lmap(lambda t: t[1], l),
                map(lambda g: grouped[g], range(0, n)))

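# A quick illustrative check (not from the original source) of modulo_group above: items are
# dealt into n buckets by index modulo n, so bucket k holds the items at positions k, k+n, k+2n...
# The definition is copied from above so the sketch runs standalone.
from typing import Any, Iterable, List

from funcy import group_by, lmap

def modulo_group(n: int, seq: Iterable) -> List[List[Any]]:
    grouped = group_by(lambda l: l[0] % n, enumerate(seq))
    return lmap(lambda l: lmap(lambda t: t[1], l),
                map(lambda g: grouped[g], range(0, n)))

assert modulo_group(3, range(10)) == [[0, 3, 6, 9], [1, 4, 7], [2, 5, 8]]
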
def summarize_evaluation(eval_dir, selection_metric="val_accuracy", ignore_worst=0):
    if not eval_dir.exists():
        print(f"No evaluation '{eval_dir}' found.")
        return

    with open(eval_dir / "config.json") as f:
        config = json.load(f)

    with open(eval_dir / "hyperparams.json") as f:
        hps = json.load(f)

    results_dir = eval_dir / "results"
    assert results_dir.exists(), f"No results found for '{eval_dir}'."
    summary_dir = eval_dir / "summary"

    if not summary_dir.exists():
        os.makedirs(summary_dir)

    result_files = [(list(fy.map(int, f[:-5].split("-"))), results_dir / f)
                    for f in os.listdir(results_dir)]
    fold_files = fy.group_by(lambda f: f[0][0], result_files)
    fold_param_files = {
        fold: fy.group_by(lambda f: f[0][1], files)
        for fold, files in fold_files.items()
    }
    folds = list(fold_param_files.items())
    folds.sort(key=fy.first)
    best_goal = selection_metrics[selection_metric]

    results = []
    all_hps = True

    for fold_i, param_files in folds:
        best_res = None
        param_file_items = list(param_files.items())
        all_hps = all_hps and len(param_files) == len(hps)

        for hp_i, files in param_file_items:
            hp_train_results = defaultdict(list)
            hp_test_results = defaultdict(list)
            selection_vals = []
            all_selection_vals = []

            for (_, _, i), file in files:
                with open(file, "r") as f:
                    result = json.load(f)

                selection_val = result["train"][selection_metric][-1]
                all_selection_vals.append(selection_val)

                if i < config["repeat"]:
                    selection_vals.append(selection_val)

                for metric, val in result["train"].items():
                    hp_train_results[metric].append(val[-1])
                for metric, val in result["test"].items():
                    hp_test_results[metric].append(val)

            top_idxs = np.argsort(np.array(all_selection_vals))

            if len(all_selection_vals) > ignore_worst:
                if best_goal == "max":
                    top_idxs = top_idxs[ignore_worst:]
                elif best_goal == "min":
                    top_idxs = top_idxs[:-ignore_worst]

            top_statistics = fy.compose(statistics, lambda l: np.array(l)[top_idxs])

            hp_res = dict(
                fold_idx=fold_i,
                train=dict_map(top_statistics, hp_train_results),
                test=dict_map(top_statistics, hp_test_results),
                select=np.mean(selection_vals),
                hp_i=hp_i,
                hp=hps[hp_i],
                select_repeats=len(selection_vals),
                eval_repeats=len(files))

            if (best_res is None
                    or (best_goal == "max" and best_res["select"] < hp_res["select"])
                    or (best_goal == "min" and best_res["select"] > hp_res["select"])
                    or (best_res["select"] == hp_res["select"]
                        and best_res["eval_repeats"] < hp_res["eval_repeats"])):
                best_res = hp_res

        if best_res is not None:
            results.append(best_res)
        else:
            print(f"No results for {fold_i}.")

    combined_train = dict_map(
        statistics,
        fy.merge_with(
            np.array,
            *map(lambda res: dict_map(lambda t: t["mean"], res["train"]), results)))
    combined_test = dict_map(
        statistics,
        fy.merge_with(
            np.array,
            *map(lambda res: dict_map(lambda t: t["mean"], res["test"]), results)))

    results_summary = {
        "folds": results,
        "combined_train": combined_train,
        "combined_test": combined_test,
        "args": {
            "ignore_worst": ignore_worst
        },
        "done": all_hps and len(folds) == 10
    }

    with open(summary_dir / "results.json", "w") as f:
        json.dump(results_summary, f, cls=NumpyEncoder, indent="\t")

    return results_summary

def transform(ast):
    return group_by(categorize, ast.walk_down())

def export_twitterUser_emotion_analysis(db='UserPost', collection="user_post"):
    client = MongoClient()
    db_tweets = client['%s' % db]
    collect_tweets = db_tweets['%s' % collection]
    db_user = client['Twitter']
    collect_user = db_user['twitter']
    from funcy import flatten, concat, group_by

    # Extract the users behind the existing posts
    pipline = [
        {"$match": {"site": "twitter"}},
        {"$group": {"_id": "$user.id_str", "count": {"$sum": 1}}}
    ]
    result = list(collect_tweets.aggregate(pipline))
    formatDocs = []
    for id in list(map(lambda x: x['_id'], result)):
        # Look up this user's profile
        user_for_id = collect_user.find_one({'id_str': id})
        # Count all posts by this user
        user_for_id_tweets_count = collect_tweets.count({"user.id_str": id, "site": 'twitter'})
        # print(user_for_id_tweets_count)
        if (user_for_id_tweets_count > 0):
            aggregate_for_user_tweets = collect_tweets.aggregate([
                {"$match": {"user.id_str": id, "site": 'twitter'}},
                {"$group": {"_id": "$user.id_str", "text": {"$push": "$text"}}}
            ])
            user_tweets_texts = list(aggregate_for_user_tweets)[0]
            # print(len(user_tweets_texts['text']))
            # print(texts)
            if len(user_tweets_texts['text']) > 300:
                ops = [{'url': 'https://tone-analyzer-demo.ng.bluemix.net/api/tone',
                        'data': ''.join(user_tweets_texts['text'][i:i + 300])}
                       for i in range(0, len(user_tweets_texts['text']), 300)]
            else:
                texts = ''.join(user_tweets_texts['text'])
                ops = [{'url': 'https://tone-analyzer-demo.ng.bluemix.net/api/tone', 'data': texts}]
            # print(ops)
            analyzer = asynchronous_request_facebook_api(ops)
            # print(analyzer[0])
            final_result = list(concat(list(flatten(list(map(
                lambda x: x['document_tone']['tones'], analyzer))))))
            group_result = group_by(lambda x: x['tone_name'], final_result)
            formatDocs.append({})
            print(len(formatDocs))
        else:
            print(id)

    df2 = pd.DataFrame(formatDocs)
    df2 = df2.applymap(lambda x: x.encode('unicode_escape').decode('utf-8')
                       if isinstance(x, str) else x)
    # print(docs)
    df2.to_excel('./export_data/%s/user_summary/%s.xlsx' % ("twitter", "twitter_user_summary"),
                 sheet_name='Sheet1')

_MONTHS = [
    'январь', 'февраль', 'март', 'апрель', 'май', 'июнь', 'июль', 'август',
    'сентябрь', 'октябрь', 'ноябрь', 'декабрь'
]

# subjects
_SUBJECTS = [
    'английский', 'алгебра', 'wolfram mathematica', 'мат. анализ',
    'дискр. мат', 'диффуры'
]

# pictures
pic_path = os.listdir(path='pictures')
all_pic = [
    list(map(lambda x: 'pictures/' + x, i))
    for i in list(group_by(0, pic_path).values())
]
pic_category = ['hello', 'bye', 'level']
_PICTURES = dict(zip(pic_category, all_pic))

# birthdays
with open('data/birth.txt', 'r', encoding='utf-8') as birthdays:
    birthdays_list = birthdays.readlines()

# films
with open('data/films.txt', 'r', encoding='utf-8') as films:
    film_list = films.readlines()

# olympiad problems
with open('data/olimp.txt', 'r', encoding='utf-8') as olimp:
    olimp_list = olimp.readlines()

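# Illustrative sketch (not from the original source) of the group_by(0, pic_path) call above:
# funcy treats an integer key as an itemgetter, so strings are grouped by their first character.
# The file names below are hypothetical; the snippet above relies on each picture category
# starting with a distinct leading character.
from funcy import group_by

names = ['hello1.png', 'hello2.png', 'bye1.png', 'level1.png']
assert dict(group_by(0, names)) == {
    'h': ['hello1.png', 'hello2.png'],
    'b': ['bye1.png'],
    'l': ['level1.png'],
}
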
def group_by_attr(attr: str, it: Iterable):
    return dict(group_by(operator.attrgetter(attr), it))