# NOTE: project-local helpers (_get_topn_ngramm, _get_topn_cocit_authors,
# _get_topn_topics, get_ngramm_filter, getreqarg_*, json_response, _logger)
# are defined elsewhere in this module; only the standard-library and aiohttp
# imports used below are listed here.
from collections import Counter, defaultdict
from functools import partial
from operator import itemgetter

from aiohttp import web


async def _req_frags_ngramm_ngramm(request: web.Request) -> web.StreamResponse:
    """Cross-distribution of «5 fragments» vs «phrases from citation contexts»."""
    app = request.app
    mdb = app['db']

    topn = getreqarg_topn(request)
    if not topn:
        topn = 10
    nka: int = getreqarg_nka(request)
    ltype: str = getreqarg_ltype(request)

    n_gramms = mdb.n_gramms
    topN = await _get_topn_ngramm(
        n_gramms, nka, ltype, topn, title_always_id=True, show_type=True)
    exists = frozenset(t for t, *_ in topN)

    pipeline = [
        {'$project': {
            'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
            'linked_papers_topics': 0, 'bundles': 0}},
        {'$lookup': {
            'from': 'n_gramms', 'localField': 'linked_papers_ngrams._id',
            'foreignField': '_id', 'as': 'cont'}},
        {'$unwind': '$cont'},
    ]
    if nka or ltype:
        pipeline += [get_ngramm_filter(nka, ltype, 'cont')]
    pipeline += [
        {'$unwind': '$linked_papers_ngrams'},
        {'$match': {'$expr': {'$eq': ['$linked_papers_ngrams._id', '$cont._id']}}},
    ]

    out_list = []
    contexts = mdb.contexts
    for i, (ngrmm, typ_, cnt, conts) in enumerate(topN, 1):
        congr = defaultdict(Counter)
        titles = {}
        types = {}

        work_pipeline = [
            {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}}
        ] + pipeline

        async for doc in contexts.aggregate(work_pipeline):
            cont = doc['cont']
            ngr_id = cont['_id']
            if ngr_id not in exists:
                continue
            fnum = doc['frag_num']
            congr[ngr_id][fnum] += doc['linked_papers_ngrams']['cnt']
            titles[ngr_id] = cont['title']
            types[ngr_id] = cont['type']

        frags = congr.pop(ngrmm)
        crossgrams = []
        otype = ltype if ltype else types[ngrmm]
        out_list.append(
            dict(
                title=titles[ngrmm], type=otype, sum=cnt, cnt_cross=len(congr),
                frags=frags, crossgrams=crossgrams))

        for j, (co, cnts) in enumerate(
            sorted(congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])), 1
        ):
            crossgrams.append(
                dict(
                    title=titles[co], type=types[co], frags=cnts,
                    sum=sum(cnts.values())))

    return json_response(out_list)
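
# Illustrative only: what the $lookup/$unwind/$match trio in the pipeline
# above computes, expressed as plain Python over already-fetched documents.
# `ngrams_by_id` is a hypothetical dict of n_gramms documents keyed by _id;
# it is not part of this module.
def _join_ngrams_sketch(context_doc: dict, ngrams_by_id: dict):
    """Yield one joined document per matching linked_papers_ngrams entry."""
    for link in context_doc.get('linked_papers_ngrams', ()):
        ngram = ngrams_by_id.get(link['_id'])
        if ngram is not None:
            # Mirrors the aggregation output: the joined ngram lands in
            # 'cont' and the matching link entry in 'linked_papers_ngrams'.
            yield {**context_doc, 'cont': ngram, 'linked_papers_ngrams': link}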
async def _req_frags_ngramm(request: web.Request) -> web.StreamResponse:
    """Distribution of «5 fragments» vs «phrases from citation contexts»."""
    app = request.app
    mdb = app['db']

    topn = getreqarg_topn(request)
    if not topn:
        topn = 10
    nka: int = getreqarg_nka(request)
    ltype: str = getreqarg_ltype(request)

    pipeline = [
        {'$match': {
            'frag_num': {'$exists': 1}, 'linked_papers_ngrams': {'$exists': 1}}},
        {'$project': {
            '_id': 1, 'frag_num': 1, 'linked_paper': '$linked_papers_ngrams'}},
        {'$unwind': '$linked_paper'},
        {'$group': {
            '_id': {'_id': '$linked_paper._id', 'frag_num': '$frag_num'},
            'count': {'$sum': '$linked_paper.cnt'}}},
        {'$group': {
            '_id': '$_id._id', 'count': {'$sum': '$count'},
            'frags': {'$push': {'frag_num': '$_id.frag_num', 'count': '$count'}}}},
        {'$sort': {'count': -1, '_id': 1}},
        {'$lookup': {
            'from': 'n_gramms', 'localField': '_id', 'foreignField': '_id',
            'as': 'ngramm'}},
        {'$unwind': '$ngramm'},
    ]
    if nka or ltype:
        pipeline += [get_ngramm_filter(nka, ltype, 'ngramm')]
    pipeline += [
        {'$project': {
            'title': '$ngramm.title', 'type': '$ngramm.type',
            'nka': '$ngramm.nka', 'count': '$count', 'frags': '$frags'}}]
    if topn:
        pipeline += [{'$limit': topn}]
    _logger.debug('pipeline: %s', pipeline)

    contexts = mdb.contexts
    out_dict = {}
    async for doc in contexts.aggregate(pipeline):
        title = doc['title']
        cnt = doc['count']
        frags = {n: 0 for n in range(1, 6)}
        frags.update(map(itemgetter('frag_num', 'count'), doc['frags']))
        dtype = doc['type']
        out = dict(sum=cnt, frags=frags)
        if not nka:
            out.update(nka=doc['nka'])
        if ltype:
            out_dict[title] = out
        else:
            out.update(type=dtype, title=title)
            did = doc['_id']
            out_dict[did] = out

    return json_response(out_dict)
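
# For reference: get_ngramm_filter (defined elsewhere in this module) is used
# above to narrow the joined n_gramms documents by nka and/or type. A sketch
# of the $match stage it is expected to build (an assumption, not the actual
# helper):
def _ngramm_filter_sketch(nka: int, ltype: str, field_prefix: str) -> dict:
    cond = {}
    if nka:
        cond[f'{field_prefix}.nka'] = nka
    if ltype:
        cond[f'{field_prefix}.type'] = ltype
    return {'$match': cond}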
async def _req_publ_publications_ngramms(
    request: web.Request
) -> web.StreamResponse:
    """Cross-distribution of «phrases from citation contexts» across publications."""
    app = request.app
    mdb = app['db']

    publications = mdb.publications
    pubs = {
        pdoc['_id']: pdoc['name']
        async for pdoc in publications.find(
            {'name': {'$exists': True}}).sort('_id')}

    topn = getreqarg_topn(request)
    if not topn:
        topn = 10
    nka: int = getreqarg_nka(request)
    ltype: str = getreqarg_ltype(request)

    if nka or ltype:
        postmatch = [
            {'$match': {
                f: v for f, v in (('cont.nka', nka), ('cont.type', ltype)) if v}}]
    else:
        postmatch = None

    pipeline = [
        {'$project': {
            'prefix': 0, 'suffix': 0, 'exact': 0, 'linked_papers_topics': 0,
            'positive_negative': 0, 'bundles': 0}},
        {'$unwind': '$linked_papers_ngrams'},
        {'$lookup': {
            'from': 'n_gramms', 'localField': 'linked_papers_ngrams._id',
            'foreignField': '_id', 'as': 'cont'}},
        {'$unwind': '$cont'},
    ]
    if postmatch:
        pipeline += postmatch

    contexts = mdb.contexts
    n_gramms = mdb.n_gramms
    out_pub_list = []
    for pub_id, pub_desc in pubs.items():
        topN = await _get_topn_ngramm(
            n_gramms, nka, ltype, topn, pub_id=pub_id, title_always_id=True,
            show_type=True)
        exists = frozenset(map(itemgetter(0), topN))
        out_list = []
        oconts = set()
        for i, (ngrmm, ntype, cnt, conts) in enumerate(topN, 1):
            congr = defaultdict(set)
            ngrms = {}

            work_pipeline = [
                {'$match': {'_id': {'$in': conts}, 'pub_id': pub_id}}
            ] + pipeline + [
                {'$match': {'cont.type': ntype}}
            ]

            async for doc in contexts.aggregate(work_pipeline):
                cont = doc['cont']
                ngr_id = cont['_id']
                if ngr_id not in exists:
                    continue
                cid = doc['_id']
                oconts.add(cid)
                congr[ngr_id].add(cid)
                ngrms[ngr_id] = dict(
                    type=cont['type'], title=cont['title'], nka=cont['nka'])

            # Renamed from `pubs` to avoid shadowing the publications dict
            # iterated by the outer loop.
            base_conts = congr.pop(ngrmm)
            b_ngrm = ngrms.pop(ngrmm)
            crossgrams = []
            for j, (co, vals) in enumerate(
                sorted(congr.items(), key=lambda kv: (-len(kv[1]), kv[0])), 1
            ):
                co_ = ngrms[co]
                crossgrams.append(
                    dict(type=co_['type'], title=co_['title'], conts_len=len(vals)))
            out_list.append(dict(
                type=b_ngrm['type'], title=b_ngrm['title'], nka=b_ngrm['nka'],
                conts=tuple(sorted(base_conts)), conts_len=len(base_conts),
                crossgrams=crossgrams, crossgrams_len=len(crossgrams)))

        out_pub_list.append(dict(
            pub_id=pub_id, descr=pub_desc, ngrams=out_list,
            ngrams_len=len(out_list), conts=tuple(sorted(oconts)),
            conts_len=len(oconts)))

    return json_response(out_pub_list)
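
# A minimal sketch of exercising the publications endpoint with aiohttp's
# test utilities. The route path '/publ/publications/ngramms' is an
# assumption for illustration; use whatever path the application registers.
from aiohttp.test_utils import TestClient, TestServer

async def _example_publications_ngramms(app: web.Application) -> None:
    async with TestClient(TestServer(app)) as client:
        resp = await client.get(
            '/publ/publications/ngramms', params={'topn': 5, 'nka': 2})
        payload = await resp.json()
        for pub in payload:
            # Each item mirrors the dict assembled above.
            assert pub['ngrams_len'] == len(pub['ngrams'])
            assert pub['conts_len'] == len(pub['conts'])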
async def _req_frags_cocitauthors_ngramms(
    request: web.Request
) -> web.StreamResponse:
    """Cross-distribution of «co-citations» vs «phrases from citation contexts»."""
    app = request.app
    mdb = app['db']

    topn: int = getreqarg_topn(request)
    topn_gramm: int = getreqarg_int(request, 'topn_gramm')
    if not topn_gramm:
        topn_gramm = 500

    nka: int = getreqarg_nka(request)
    ltype: str = getreqarg_ltype(request)

    if topn_gramm:
        n_gramms = mdb.n_gramms
        top_ngramms = await _get_topn_ngramm(
            n_gramms, nka, ltype, topn_gramm, title_always_id=True)
        exists = frozenset(t for t, _, _ in top_ngramms)
    else:
        exists = ()

    contexts = mdb.contexts
    topN = await _get_topn_cocit_authors(contexts, topn, include_conts=True)

    pipeline = [
        {'$project': {
            'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
            'bundles': 0, 'linked_papers_topics': 0}},
        {'$lookup': {
            'from': 'n_gramms', 'localField': 'linked_papers_ngrams._id',
            'foreignField': '_id', 'as': 'cont'}},
        {'$unwind': '$cont'},
    ]
    if nka or ltype:
        pipeline += [get_ngramm_filter(nka, ltype, 'cont')]
    pipeline += [
        {'$unwind': '$linked_papers_ngrams'},
        {'$match': {'$expr': {'$eq': ['$cont._id', '$linked_papers_ngrams._id']}}},
    ]

    out_dict = {}
    for i, (cocitauthor, cnt, conts) in enumerate(topN, 1):
        frags = Counter()
        congr = defaultdict(Counter)
        titles = {}
        types = {}

        work_pipeline = [
            {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
        ] + pipeline

        async for doc in contexts.aggregate(work_pipeline):
            cont = doc['cont']
            ngr_id = cont['_id']
            if topn_gramm and ngr_id not in exists:
                continue
            fnum = doc['frag_num']
            frags[fnum] += 1
            congr[ngr_id][fnum] += doc['linked_papers_ngrams']['cnt']
            titles[ngr_id] = cont['title']
            types[ngr_id] = cont['type']

        crossgrams = []
        out_dict[cocitauthor] = dict(sum=cnt, frags=frags, crossgrams=crossgrams)
        for j, (co, cnts) in enumerate(
            sorted(congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])), 1
        ):
            crossgrams.append(
                dict(
                    title=titles[co], type=types[co], frags=cnts,
                    sum=sum(cnts.values())))

    return json_response(out_dict)
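
# The sort key used above is a recurring idiom in these handlers: order
# co-occurring ngrams by total count (descending), then by key (ascending),
# so ties come out in a stable, reproducible order. A self-contained
# demonstration:
def _demo_crossgram_order() -> None:
    congr = {'b': Counter({1: 2}), 'a': Counter({1: 2}), 'c': Counter({2: 5})}
    ordered = sorted(congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0]))
    # 'c' has the largest total; 'a' precedes 'b' because their totals tie.
    assert [key for key, _ in ordered] == ['c', 'a', 'b']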
async def _req_frags_topics_ngramms(request: web.Request) -> web.StreamResponse:
    """Cross-distribution of «topics» vs «phrases»."""
    app = request.app
    mdb = app['db']

    topn = getreqarg_topn(request)
    # The query parameter is spelled 'topn_crpssgramm' in the public API,
    # so the misspelling is kept for compatibility.
    topn_crpssgramm: int = getreqarg_int(request, 'topn_crpssgramm')
    topn_gramm: int = getreqarg_int(request, 'topn_gramm')
    if not topn_gramm:
        topn_gramm = 500

    nka = getreqarg_nka(request)
    ltype = getreqarg_ltype(request)

    if topn_gramm:
        n_gramms = mdb.n_gramms
        top_ngramms = await _get_topn_ngramm(
            n_gramms, nka, ltype, topn_gramm, title_always_id=True)
        exists = frozenset(t for t, _, _ in top_ngramms)
    else:
        exists = ()

    pipeline = [
        {'$project': {
            'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
            'bundles': 0, 'linked_papers_topics': 0}},
        {'$unwind': '$linked_papers_ngrams'},
        {'$lookup': {
            'from': 'n_gramms', 'localField': 'linked_papers_ngrams._id',
            'foreignField': '_id', 'as': 'ngrm'}},
        {'$unwind': '$ngrm'},
    ]
    if nka or ltype:
        pipeline += [get_ngramm_filter(nka, ltype, 'ngrm')]
    pipeline += [
        {'$sort': {'ngrm.count_in_linked_papers': -1, 'ngrm.count_all': -1}},
    ]

    top_topics = await _get_topn_topics(mdb.topics, topn)

    contexts = mdb.contexts
    out_dict = {}
    zero_frags = {n: 0 for n in range(1, 6)}
    for i, (topic, cnt, conts) in enumerate(top_topics, 1):
        frags = Counter(zero_frags)
        congr = defaultdict(partial(Counter, zero_frags))

        work_pipeline = [
            {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
        ] + pipeline

        async for doc in contexts.aggregate(work_pipeline):
            cont = doc['ngrm']
            ngr = cont['title']
            if exists and cont['_id'] not in exists:
                continue
            fnum = doc['frag_num']
            frags[fnum] += 1
            congr[ngr][fnum] += 1
            if topn_crpssgramm and len(congr) == topn_crpssgramm:
                break

        crossgrams = {}
        out_dict[topic] = dict(sum=cnt, frags=frags, crossgrams=crossgrams)
        for j, (co, cnts) in enumerate(
            sorted(congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])), 1
        ):
            crossgrams[co] = dict(frags=cnts, sum=sum(cnts.values()))

    return json_response(out_dict)
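
# Why defaultdict(partial(Counter, zero_frags)) is used above: every new key
# gets a fresh Counter pre-seeded with zeros for fragments 1..5, so the JSON
# output always carries all five fragment buckets even when some are empty.
def _demo_zero_seeded_counter() -> None:
    zero_frags = {n: 0 for n in range(1, 6)}
    congr = defaultdict(partial(Counter, zero_frags))
    congr['term'][3] += 1
    assert dict(congr['term']) == {1: 0, 2: 0, 3: 1, 4: 0, 5: 0}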