예제 #1
0
async def _req_frags_pubs(
  request: web.Request
) -> web.StreamResponse:
  """
  А
  Распределение цитирований по 5-ти фрагментам для отдельных публикаций. #заданного автора.
  """
  app = request.app
  mdb = app['db']
  publications = mdb.publications
  pubs = {
    pdoc['_id']: (pdoc['name'], Counter())
    async for pdoc in publications.find({'name': {'$exists': True}})
  }
  contexts = mdb.contexts
  async for doc in contexts.aggregate([
    {'$match': {'frag_num': {'$gt': 0}}},
    {'$group': {
      '_id': {'fn': '$frag_num', 'pub_id': '$pub_id'}, 'count': {'$sum': 1}}},
    {'$sort': {'_id': 1}}
  ]):
    did = doc['_id']
    pub_id = did['pub_id']
    frn = did['fn']
    pubs[pub_id][1][frn] += int(doc['count'])

  out_pubs = {}
  for i, (pid, pub) in enumerate(
    sorted(pubs.items(), key=lambda kv: sum(kv[1][1].values()), reverse=True),
    1
  ):
    pcnts = pub[1]
    out_pubs[pid] = dict(descr=pub[0], sum=sum(pcnts.values()), frags=pcnts)

  return json_response(out_pubs)
예제 #2
0
async def _req_frags_cocitauthors(request: web.Request) -> web.StreamResponse:
  """
  А
  Распределение «со-цитируемые авторы» по 5-ти фрагментам
  """
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)

  contexts = mdb.contexts

  pipeline = [
    {'$match': {'frag_num': {'$gt': 0}, 'cocit_authors': {'$exists': True}}},
    {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
    {'$unwind': '$cocit_authors'},
    {'$group': {
        '_id': '$cocit_authors', 'count': {'$sum': 1},
        'frags': {'$push': '$frag_num'}}},
    {'$sort': {'count': -1, '_id': 1}},
  ]
  if topn:
    pipeline += [{'$limit': topn},]
  out_dict = {}
  async for doc in contexts.aggregate(pipeline):
    frags = Counter(doc['frags'])
    out_dict[doc['_id']] = dict(sum=doc['count'], frags=frags)

  return json_response(out_dict)
예제 #3
0
async def _req_frags_ngramms_cocitauthors(request: web.Request) -> web.StreamResponse:
  """Кросс-распределение «фразы» - «со-цитирования»"""
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)
  if not topn:
    topn = 10

  contexts = mdb.contexts

  topn_authors:int = getreqarg_int(request, 'topn_cocitauthors')
  if topn_authors:
    topNa = await _get_topn_cocit_authors(
      contexts, topn_authors, include_conts=False)
    exists = frozenset(t for t, _ in topNa)
  else:
    exists = ()

  nka:int = getreqarg_int(request, 'nka')
  ltype:str = getreqarg(request, 'ltype')

  n_gramms = mdb.n_gramms
  top_ngramms = await _get_topn_ngramm(
    n_gramms, nka, ltype, topn, title_always_id=True, show_type=True)

  out_dict = []
  for i, (ngramm, typ_, cnt, conts) in enumerate(top_ngramms, 1):
    frags = Counter()
    congr = defaultdict(Counter)
    cnt = 0

    async for doc in contexts.aggregate([
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
      {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
      {'$unwind': '$cocit_authors'},
    ]):
      ngr = doc['cocit_authors']
      if topn_authors and ngr not in exists:
        continue

      cnt += 1
      fnum = doc['frag_num']
      frags[fnum] += 1
      congr[ngr][fnum] += 1

    if cnt:
      crosscocitaith = {}
      # out_dict[ngramm] = dict(
      #   sum=cnt, frags=frags, cocitaithors=crosscocitaith)
      out_dict.append(dict(
        title=ngramm.split('_', 1)[-1], type=typ_,
        sum=cnt, frags=frags, cocitaithors=crosscocitaith))

      for j, (co, cnts) in enumerate(
        sorted(congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])), 1
      ):
        crosscocitaith[co] = dict(frags=cnts, sum=sum(cnts.values()))

  return json_response(out_dict)
예제 #4
0
async def _reg_cnt_ngramm(request: web.Request) -> web.StreamResponse:
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)

  nka = getreqarg_int(request, 'nka')
  ltype = getreqarg(request, 'ltype')
  if nka or ltype:
    pipeline = [get_ngramm_filter(nka, ltype)]
  else:
    pipeline = []

  pipeline += [{'$sort': {'count_all': -1, 'title': 1, 'type': 1}}]
  if topn:
    pipeline += [{'$limit': topn}]

  out = []
  get_as_tuple = itemgetter('title', 'type', 'linked_papers')
  n_gramms = mdb.n_gramms
  async for doc in n_gramms.aggregate(pipeline):
    title, lt, conts = get_as_tuple(doc)
    res = dict(title=title) if ltype else dict(title=title, type=lt)
    cnt_all = cnt_cont = 0
    pubs = set()
    for cid, cnt in (c.values() for c in conts):
      cnt_cont += 1
      cnt_all += cnt
      pubs.add(cid.rsplit('@', 1)[0])
    res.update(
      count_all=doc['count_all'], count=cnt_all, count_conts=cnt_cont,
      conts_pubs=len(pubs))
    out.append(res)

  return json_response(out)
예제 #5
0
async def _req_frags_cocitauthors_topics(request: web.Request) -> web.StreamResponse:
  """Кросс-распределение «со-цитирования» - «топики контекстов цитирований»"""
  app = request.app
  mdb = app['db']

  topn:int = getreqarg_topn(request)
  topn_topics:int = getreqarg_int(request, 'topn_topics')
  if topn_topics:
    topics = mdb.topics
    top_topics = await _get_topn_topics(topics, topn=topn)
    exists = frozenset(t for t, _, _ in top_topics)
  else:
    exists = ()

  contexts = mdb.contexts
  topN = await _get_topn_cocit_authors(contexts, topn, include_conts=True)

  pipeline = [
    {'$project': {
      'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
      'bundles': 0, 'linked_papers_ngrams': 0, 'cocit_authors': 0}},
    {'$unwind': '$linked_papers_topics'},
    {'$lookup': {
      'from': 'topics', 'localField': 'linked_papers_topics._id',
      'foreignField': '_id', 'as': 'cont'}},
    {'$unwind': '$cont'},
    {'$match': {'$expr': {'$eq': ['$cont._id', '$linked_papers_topics._id']}}},
  ]

  out_dict = {}
  for i, (cocitauthor, cnt, conts) in enumerate(topN, 1):
    frags = Counter()
    congr = defaultdict(Counter)

    work_pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
    ] + pipeline
    # _logger.debug('cocitauthor: "%s", cnt: %s, pipeline: %s', cocitauthor, cnt, work_pipeline)

    async for doc in contexts.aggregate(work_pipeline):
      cont = doc['cont']
      topic = cont['title']
      if topn_topics and topic not in exists:
        continue

      fnum = doc['frag_num']
      frags[fnum] += 1
      congr[topic][fnum] += 1

    crosstopics = {}
    out_dict[cocitauthor] = dict(sum=cnt, frags=frags, crosstopics=crosstopics)

    for j, (co, cnts) in enumerate(
      sorted(congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])), 1
    ):
      crosstopics[co] = dict(frags=cnts, sum=sum(cnts.values()))

  return json_response(out_dict)
예제 #6
0
async def _req_frags_ngramms_topics(request: web.Request) -> web.StreamResponse:
  """Кросс-распределение «фразы» - «топики контекстов цитирований»"""
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)
  if not topn:
    topn = 10

  topn_topics:int = getreqarg_int(request, 'topn_topics')
  if topn_topics:
    topics = mdb.topics
    topNt = await _get_topn_topics(topics, topn=topn_topics)
    exists = frozenset(t for t, _, _ in topNt)
  else:
    exists = ()

  nka:int = getreqarg_nka(request)
  ltype:str = getreqarg_ltype(request)

  n_gramms = mdb.n_gramms
  top_ngramms = await _get_topn_ngramm(
    n_gramms, nka, ltype, topn, title_always_id=True, show_type=True)

  contexts = mdb.contexts

  out_dict = []
  zerro_frags = {n: 0 for n in range(1, 6)}
  for i, (ngrmm, typ_, cnt, conts) in enumerate(top_ngramms, 1):
    frags = Counter(zerro_frags)
    congr = defaultdict(partial(Counter, zerro_frags))

    pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
      {'$project': {
        'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
        'bundles': 0, 'linked_papers_ngrams': 0}},
      {'$unwind': '$linked_papers_topics'},
    ]
    # _logger.debug('ngrmm: "%s", cnt: %s, pipeline: %s', ngrmm, cnt, pipeline)
    async for doc in contexts.aggregate(pipeline):
      cont = doc['linked_papers_topics']
      topic = cont['_id']
      fnum = doc['frag_num']
      frags[fnum] += 1
      congr[topic][fnum] += 1

    crosstopics = {}
    out_dict.append(dict(
      title=ngrmm.split('_', 1)[-1], type=typ_, sum=cnt, frags=frags,
      crosstopics=crosstopics))

    for j, (co, cnts) in enumerate(
      sorted(congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])), 1
    ):
      crosstopics[co] = dict(frags=cnts, sum=sum(cnts.values()))

  return json_response(out_dict)
예제 #7
0
async def _req_top_topics(request: web.Request) -> web.StreamResponse:
  """Топ N топиков"""
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)

  topics = mdb.topics
  topN = await _get_topn_topics(topics, topn=topn)

  out = tuple(dict(title=n, contects=conts) for n, _, conts in topN)
  return json_response(out)
예제 #8
0
async def _req_top_cocitauthors(request: web.Request) -> web.StreamResponse:
  """Топ N со-цитируемых авторов"""
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)

  contexts = mdb.contexts
  topN = await _get_topn_cocit_authors(contexts, topn, include_conts=True)
  out = tuple(dict(title=n, cnt=cnt, contects=conts) for n, cnt, conts in topN)

  return json_response(out)
예제 #9
0
async def _req_publ_topics_topics(request: web.Request) -> web.StreamResponse:
  """Кросс-распределение «публикации» - «топики контекстов цитирований»"""
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)

  topics = mdb.topics
  topN = await _get_topn_topics(topics, topn=topn)

  pipeline = [
    {'$project': {
      'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
      'bundles': 0, 'linked_papers_ngrams': 0}},
    {'$lookup': {
        'from': 'topics', 'localField': 'linked_papers_topics._id',
      'foreignField': '_id', 'as': 'cont'}},
    {'$unwind': '$cont'},
    {'$unwind': '$linked_papers_topics'},
    {'$match': {'$expr': {'$eq': ['$linked_papers_topics._id', '$cont._id']}}},
  ]

  contexts = mdb.contexts
  out_list = []
  for i, (topic, cnt, conts) in enumerate(topN, 1):
    congr = defaultdict(set)

    work_pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
    ] + pipeline
    # _logger.debug('topic: "%s", cnt: %s, pipeline: %s', topic, cnt, work_pipeline)
    async for doc in contexts.aggregate(work_pipeline):
      cont = doc['cont']
      ngr = cont['title']
      pub_id = doc['pub_id']
      congr[ngr].add(pub_id)

    pubs = congr.pop(topic)
    crosstopics = []
    out_list.append(
      dict(
        topic=topic, cnt_pubs=len(pubs), cnt_cross=len(congr),
        pubs=tuple(sorted(pubs)),
        crosstopics=crosstopics))


    for j, (co, vals) in enumerate(
      sorted(congr.items(), key=lambda kv: (-len(kv[1]), kv[0])), 1
    ):
      crosstopics.append(
        dict(title=co, cnt=len(vals), pubs=tuple(sorted(vals))))

  return json_response(out_list)
예제 #10
0
async def _req_frags(request:web.Request) -> web.StreamResponse:
  """
  А
  Суммарное распределение цитирований по 5-ти фрагментам для всех публикаций
  """
  app = request.app
  mdb = app['db']
  contexts = mdb.contexts
  curs = contexts.aggregate([
    {'$match': {'frag_num': {'$gt': 0}}},
    {'$group': {'_id': '$frag_num', 'count': {'$sum': 1}}},
    {'$sort': {'_id': 1}}])
  cnts = Counter({doc['_id']: int(doc["count"]) async for doc in curs})
  return json_response(cnts)
예제 #11
0
async def _req_top_cocitauthors_pubs(request: web.Request) -> web.StreamResponse:
  """Топ N со-цитируемых авторов по публикациям"""
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)

  contexts = mdb.contexts
  topN = await _get_topn_cocit_authors(contexts, topn, include_conts=True)
  out = {
    n: dict(all=cnt, pubs=Counter(c.rsplit('@', 1)[0] for c in conts))
    for n, cnt, conts in topN}

  return json_response(out)
예제 #12
0
async def _req_frags_topics_cocitauthors(
  request: web.Request
) -> web.StreamResponse:
  """Кросс-распределение «топики» - «со-цитирования»"""
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)

  contexts = mdb.contexts

  topn_authors:int = getreqarg_int(request, 'topn_cocitauthors')
  if topn_authors:
    topNa = await _get_topn_cocit_authors(
      contexts, topn_authors, include_conts=False)
    exists = frozenset(t for t, _ in topNa)
  else:
    exists = ()

  topics = mdb.topics
  topN = await _get_topn_topics(topics, topn=topn)

  out_dict = {}
  for i, (topic, cnt, conts) in enumerate(topN, 1):
    frags = Counter()
    congr = defaultdict(Counter)

    async for doc in contexts.aggregate([
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
      {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
      {'$unwind': '$cocit_authors'},
    ]):
      ngr = doc['cocit_authors']
      if topn_authors and ngr not in exists:
        continue

      fnum = doc['frag_num']
      frags[fnum] += 1
      congr[ngr][fnum] += 1

    crosscocitaith = {}
    out_dict[topic] = dict(sum=cnt, frags=frags, cocitaithors=crosscocitaith)

    for j, (co, cnts) in enumerate(
      sorted(congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])), 1
    ):
      crosscocitaith[co] = dict(frags=cnts, sum=sum(cnts.values()))

  return json_response(out_dict)
예제 #13
0
async def _req_top_ngramm(request: web.Request) -> web.StreamResponse:
  """Топ N фраз"""
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)

  nka = getreqarg_int(request, 'nka')
  ltype = getreqarg(request, 'ltype')

  n_gramms = mdb.n_gramms
  topN = await _get_topn_ngramm(n_gramms, nka, ltype, topn)

  out = tuple(dict(title=n, contects=conts) for n, _, conts in topN)
  return json_response(out)
예제 #14
0
async def _req_top_topics_pubs(request: web.Request) -> web.StreamResponse:
  """Топ N топиков"""
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)

  topics = mdb.topics
  topN = await _get_topn_topics(topics, topn=topn)

  # out = tuple(dict(title=n, contects=conts) for n, _, conts in topN)
  out = {
    n: dict(all=cnt, pubs=Counter(c.rsplit('@', 1)[0] for c in conts))
    for n, cnt, conts in topN}
  return json_response(out)
예제 #15
0
async def _reg_cnt_pubs_ngramm(request: web.Request) -> web.StreamResponse:
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)

  nka = getreqarg_int(request, 'nka')
  ltype = getreqarg(request, 'ltype')
  if nka or ltype:
    pipeline = [get_ngramm_filter(nka, ltype)]
  else:
    pipeline = []

  n_gramms = mdb.n_gramms
  publications:Collection = mdb.publications
  out = {}
  get_as_tuple = itemgetter('title', 'type', 'linked_papers')

  async for pobj in publications.find({'name': {'$exists': True}}):
    pub_id = pobj['_id']
    pipeline_work = [
      {'$match': {'linked_papers.cont_id': {'$regex': f'^{pub_id}@'}}}
    ] + pipeline
    out_ngrs = []

    cont_starts = pub_id + '@'
    async for obj in n_gramms.aggregate(pipeline_work):
      title, lt, conts = get_as_tuple(obj)
      res = dict(title=title) if ltype else dict(title=title, type=lt)
      res.update(count_all=obj['count_all'])
      cnt_all = cnt_cont = 0
      for cid, cnt in (c.values() for c in conts):
        if cid.startswith(cont_starts):
          cnt_cont += 1
          cnt_all += cnt
      res.update(count=cnt_all, count_conts=cnt_cont,
        # conts=conts
      )
      out_ngrs.append(res)

    out_ngrs = sorted(out_ngrs, key=itemgetter('count'), reverse=True)
    if topn:
      out_ngrs = out_ngrs[:topn]
    out[pub_id] = out_ngrs

  return json_response(out)
예제 #16
0
async def _req_frags_topics_topics(request: web.Request) -> web.StreamResponse:
  """Кросс-распределение «5 фрагментов» - «топики контекстов цитирований»"""
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)

  topics = mdb.topics
  topN = await _get_topn_topics(topics, topn=topn)

  contexts = mdb.contexts
  out_dict = {}
  for i, (ngrmm, cnt, conts) in enumerate(topN, 1):
    congr = defaultdict(partial(Counter, {n: 0 for n in range(1, 6)}))

    pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
      {'$project': {
        'prefix': 0, 'suffix': 0, 'exact': 0, 'linked_papers_ngrams': 0,
        'positive_negative': 0, 'bundles': 0}},
      {'$unwind': '$linked_papers_topics'},
    ]
    # _logger.debug('ngrmm: "%s":, cnt %s, pipeline: %s', ngrmm, cnt, pipeline)
    async for doc in contexts.aggregate(pipeline):

      cont = doc['linked_papers_topics']
      ngr = cont['_id']

      fnum = doc['frag_num']
      congr[ngr][fnum] += 1

    frags = congr.pop(ngrmm)
    crosstopics = {}
    out_dict[ngrmm] = dict(sum=cnt, frags=frags, crosstopics=crosstopics)


    for j, (co, cnts) in enumerate(
      sorted(congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])), 1
    ):
      crosstopics[co] = dict(frags=cnts, sum=sum(cnts.values()))

  return json_response(out_dict)
예제 #17
0
async def _req_frags_topics(request: web.Request) -> web.StreamResponse:
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)

  topics = mdb.topics
  pipeline = [{
    '$lookup': {
      'from': 'contexts', 'localField': '_id',
      'foreignField': 'linked_papers_topics._id', 'as': 'cont'}},
    {'$unwind': '$cont'},
    {'$project': {
      'title': 1, 'linked_paper': '$cont.linked_papers_topics',
      'cont_id': '$cont._id', 'pub_id': '$cont.pub_id',
      'frag_num': '$cont.frag_num'}},
    {'$unwind': '$linked_paper'},
    {'$match': {
      'frag_num': {'$gt': 0}, '$expr': {'$eq': ['$_id', '$linked_paper._id']}}},
    {'$group': {
      '_id': {'title': '$title', 'frag_num': '$frag_num', },
      'count': {'$sum': 1}, }},
    {'$sort': {'_id': 1, 'count': -1}},
    {'$group': {
      '_id': '$_id.title', 'count': {'$sum': '$count'},
      'frags': {'$push': {'frag_num': '$_id.frag_num', 'count': '$count'}}, }},
    {'$sort': {'count': -1, '_id': 1}},
  ]
  if topn:
    pipeline += [{'$limit': topn}]

  out_dict = {}
  async for doc in topics.aggregate(pipeline):
    cnt = doc['count']
    title = doc['_id']
    frags = {n: 0 for n in range(1, 6)}
    frags.update(map(itemgetter('frag_num', 'count'), doc['frags']))
    out_dict[title] = dict(sum=cnt, frags=frags)

  return json_response(out_dict)
예제 #18
0
async def _req_top_ngramm_pubs(request: web.Request) -> web.StreamResponse:
  """Топ N фраз по публикациям"""
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)

  nka = getreqarg_int(request, 'nka')
  ltype = getreqarg(request, 'ltype')
  if nka or ltype:
    pipeline = [get_ngramm_filter(nka, ltype)]
  else:
    pipeline = []

  pipeline += [
    {'$unwind': '$linked_papers'},
    {'$group': {
      '_id': '$title', 'count': {'$sum': '$linked_papers.cnt'}, 'conts': {
        '$addToSet': {
          'cont_id': '$linked_papers.cont_id', 'cnt': '$linked_papers.cnt'}}}},
    {'$sort': {'count': -1, '_id': 1}},
  ]
  if topn:
    pipeline += [{'$limit': topn}]

  n_gramms = mdb.n_gramms
  get_as_tuple = itemgetter('_id', 'count', 'conts')
  topN = [get_as_tuple(obj) async for obj in n_gramms.aggregate(pipeline)]

  get_pubs = itemgetter('cont_id', 'cnt')
  out = {
    name: dict(
      all=cnt, contects=Counter(
        p for p, n in (
          (c.rsplit('@', 1)[0], n) for c, n in (get_pubs(co) for co in conts))
        for _ in range(n)
      ))
    for name, cnt, conts in topN}
  return json_response(out)
예제 #19
0
async def _req_frags_cocitauthors_ngramms(request: web.Request) -> web.StreamResponse:
  """Б Кросс-распределение «со-цитирования» - «фразы из контекстов цитирований»"""
  app = request.app
  mdb = app['db']

  topn:int = getreqarg_topn(request)
  # if not topn:
  #   topn = 10
  topn_gramm:int = getreqarg_int(request, 'topn_gramm')
  if not topn_gramm:
    topn_gramm = 500

  nka:int = getreqarg_nka(request)
  ltype:str = getreqarg_ltype(request)

  if topn_gramm:
    n_gramms = mdb.n_gramms
    top_ngramms = await _get_topn_ngramm(
      n_gramms, nka, ltype, topn_gramm, title_always_id=True)
    exists = frozenset(t for t, _, _ in top_ngramms)
  else:
    exists = ()

  contexts = mdb.contexts
  topN = await _get_topn_cocit_authors(contexts, topn, include_conts=True)

  pipeline = [
    {'$project': {
      'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
      'bundles': 0, 'linked_papers_topics': 0}},
    {'$lookup': {
      'from': 'n_gramms', 'localField': 'linked_papers_ngrams._id',
      'foreignField': '_id', 'as': 'cont'}},
    {'$unwind': '$cont'},
  ]

  if nka or ltype:
    pipeline += [get_ngramm_filter(nka, ltype, 'cont')]

  pipeline += [
    {'$unwind': '$linked_papers_ngrams'},
    {'$match': {'$expr': {'$eq': ['$cont._id', '$linked_papers_ngrams._id']}}},
  ]

  out_dict = {}
  for i, (cocitauthor, cnt, conts) in enumerate(topN, 1):
    frags = Counter()
    congr = defaultdict(Counter)
    titles = {}
    types = {}

    work_pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
    ] +  pipeline

    # _logger.debug('cocitauthor: "%s", cnt: %s, pipeline: %s', cocitauthor, cnt, work_pipeline)
    async for doc in contexts.aggregate(work_pipeline):
      cont = doc['cont']
      ngr_title = cont['title']
      ngr_id = cont['_id']
      if topn_gramm and ngr_id not in exists:
        continue

      fnum = doc['frag_num']
      frags[fnum] += 1
      ngr_cnt = doc['linked_papers_ngrams']['cnt']
      congr[ngr_id][fnum] += ngr_cnt
      titles[ngr_id] = ngr_title
      types[ngr_id] = cont['type']

    crossgrams = []
    out_dict[cocitauthor] = dict(sum=cnt, frags=frags, crossgrams=crossgrams)

    for j, (co, cnts) in enumerate(
      sorted(congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])), 1
    ):
      ngr = dict(
        title=titles[co], type=types[co], frags=cnts, sum=sum(cnts.values()))
      crossgrams.append(ngr)

  return json_response(out_dict)
예제 #20
0
async def _req_publ_publications_cocitauthors(
  request: web.Request
) -> web.StreamResponse:
  """
  А
  Кросс-распределение «со-цитируемые авторы» по публикациям
  """
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)

  publications = mdb.publications
  pubs = {
    pdoc['_id']: pdoc['name']
    async for pdoc in publications.find({'name': {'$exists': True}})
  }

  pipeline = [
    #{'$match': {'pub_id': pub_id, 'cocit_authors': {'$exists': True}}},
    {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
    {'$unwind': '$cocit_authors'},
    {'$group': {
      '_id': '$cocit_authors', 'count': {'$sum': 1},
      'conts': {'$addToSet': '$_id'}}},
    {'$sort': {'count': -1, '_id': 1}},
  ]
  if topn:
    pipeline += [{'$limit': topn}, ]

  out_dict = {}

  contexts = mdb.contexts
  for pub_id, pub_desc in pubs.items():
    cocitauthors = {}
    out_dict[pub_id] = od = dict(descr=pub_desc)
    pipeline_work = [{
      '$match': {'pub_id': pub_id, 'cocit_authors': {'$exists': True}}},
    ] + pipeline

    conts2aut = defaultdict(set)
    async for doc in contexts.aggregate(pipeline_work):
      a = doc['_id']
      conts = doc['conts']
      cocitauthors[a] = tuple(sorted(conts))
      for c in conts:
        conts2aut[c].add(a)

    if conts2aut:
      od.update(
        conts=tuple(sorted(conts2aut.keys())), conts_len=len(conts2aut))
    if cocitauthors:
      # od.update(cocitauthors=cocitauthors)
      occa = {}
      for a, cs in cocitauthors.items():
        occa[a] = ac = defaultdict(set)
        for c in cs:
          for a2 in conts2aut[c]:
            if a2 == a:
              continue
            ac[a2].add(c)
      od.update(
        cocitauthors={
          a: {a2: tuple(sorted(cc)) for a2, cc in v.items()}
          for a, v in occa.items()},
        cocitauthors_len=len(occa))

  return json_response(out_dict)
예제 #21
0
async def _req_publ_ngramm_ngramm(request: web.Request) -> web.StreamResponse:
  """Кросс-распределение «публикации» - «фразы из контекстов цитирований»"""
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)
  if not topn:
    topn = 10

  nka:int = getreqarg_nka(request)
  ltype:str = getreqarg_ltype(request)

  n_gramms = mdb.n_gramms
  topN = await _get_topn_ngramm(
    n_gramms, nka, ltype, topn, title_always_id=True, show_type=True)
  exists = frozenset(t for t, *_ in topN)
  out_list = []
  contexts = mdb.contexts

  pipeline = [
    {'$project': {
      'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
      'bundles': 0, 'linked_papers_topics': 0}},
    {'$lookup': {
      'from': 'n_gramms', 'localField': 'linked_papers_ngrams._id',
      'foreignField': '_id', 'as': 'cont'}},
    {'$unwind': '$cont'},
  ]
  if nka or ltype:
    pipeline += [get_ngramm_filter(nka, ltype, 'cont')]

  pipeline += [
    {'$unwind': '$linked_papers_ngrams'},
    {'$match': {'$expr': {'$eq': ['$linked_papers_ngrams._id', '$cont._id']}}},
  ]
  for i, (ngrmm, typ_, cnt, conts) in enumerate(topN, 1):
    congr = defaultdict(set)
    titles = {}
    types = {}

    work_pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}}
    ] + pipeline

    # _logger.debug('ngrmm: "%s", cnt: %s, pipeline: %s', ngrmm, cnt, work_pipeline)

    async for doc in contexts.aggregate(work_pipeline):
      cont = doc['cont']
      ngr_id = cont['_id']
      if ngr_id not in exists:
        continue
      pub_id = doc['pub_id']
      congr[ngr_id].add(pub_id)
      titles[ngr_id] = cont['title']
      types[ngr_id] = cont['type']

    pubs = congr.pop(ngrmm, None)
    if not pubs:
      continue

    crossgrams = []
    oltype = ltype if ltype else types[ngrmm]

    out_list.append(
      dict(
        title=titles[ngrmm], type=oltype, pubs=tuple(sorted(pubs)),
        crossgrams=crossgrams, cnt_pubs=len(pubs), cnt_cross=len(congr)))

    enum_sort = enumerate(
      sorted(congr.items(), key=lambda kv: (-len(kv[1]), kv[0])), 1)
    for j, (co, vals) in enum_sort:
      crossgrams.append(
        dict(
          title=titles[co], type=types[co], cnt=len(vals),
          pubs=tuple(sorted(vals))))

  return json_response(out_list)
예제 #22
0
async def _req_publ_publications_ngramms(
  request: web.Request
) -> web.StreamResponse:
  """Кросс-распределение «фразы из контекстов цитирований» по публикациям"""
  app = request.app
  mdb = app['db']

  publications = mdb.publications
  pubs = {
    pdoc['_id']: pdoc['name']
    async for pdoc in publications.find({'name': {'$exists': True}}).sort('_id')
  }

  topn = getreqarg_topn(request)
  if not topn:
    topn = 10

  nka:int = getreqarg_nka(request)
  ltype:str = getreqarg_ltype(request)

  if nka or ltype:
    postmath = [
      {'$match': {
        f: v for f, v in (('cont.nka', nka), ('cont.type', ltype)) if v}}]
  else:
    postmath = None

  pipeline = [
    {'$project': {
      'prefix': 0, 'suffix': 0, 'exact': 0, 'linked_papers_topics': 0,
      'positive_negative': 0, 'bundles': 0},},
    {'$unwind': '$linked_papers_ngrams'},
    {'$lookup': {
      'from': 'n_gramms', 'localField': 'linked_papers_ngrams._id',
      'foreignField': '_id', 'as': 'cont'}},
    {'$unwind': '$cont'},
  ]
  if postmath:
    pipeline += postmath

  contexts = mdb.contexts
  n_gramms = mdb.n_gramms

  out_pub_list = []

  for pub_id, pub_desc in pubs.items():
    topN = await _get_topn_ngramm(
      n_gramms, nka, ltype, topn, pub_id=pub_id, title_always_id=True,
      show_type=True)
    exists = frozenset(map(itemgetter(0), topN))

    out_list = []
    oconts = set()

    for i, (ngrmm, ntype, cnt, conts) in enumerate(topN, 1):
      congr = defaultdict(set)
      ngrms = {}

      work_pipeline = [
        {'$match': {'_id': {'$in': conts}, 'pub_id': pub_id}}
      ] + pipeline + [
        {'$match': {'cont.type': ntype}}
      ]
      # _logger.debug('pipeline: %s', work_pipeline)
      async for doc in contexts.aggregate(work_pipeline):
        cont = doc['cont']
        ngr_id = cont['_id']
        ngr = cont['title']
        if ngr_id not in exists:
          continue
        cid = doc['_id']
        oconts.add(cid)
        congr[ngr_id].add(cid)
        ngrms[ngr_id] = dict(type=cont['type'], title=ngr, nka=cont['nka'])

      pubs = congr.pop(ngrmm)
      b_ngrm = ngrms.pop(ngrmm)
      crossgrams = []

      for j, (co, vals) in enumerate(
        sorted(congr.items(), key=lambda kv: (-len(kv[1]), kv[0])), 1
      ):
        co_ = ngrms[co]
        crossgrams.append(
          dict(type=co_['type'], title=co_['title'], conts_len=len(vals)))

      out_list.append(dict(
        type=b_ngrm['type'], title=b_ngrm['title'], nka=b_ngrm['nka'],
        conts=tuple(sorted(pubs)), conts_len=len(pubs),
        crossgrams=crossgrams, crossgrams_len=len(crossgrams)))

    out_pub_list.append(dict(
      pub_id=pub_id, descr=pub_desc, ngrams=out_list, ngrams_len=len(out_list),
      conts=tuple(sorted(oconts)), conts_len=len(oconts)))

  return json_response(out_pub_list)
예제 #23
0
async def _req_frags_ngramm(request: web.Request) -> web.StreamResponse:
  """Распределение «5 фрагментов» - «фразы из контекстов цитирований»"""
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)
  if not topn:
    topn = 10

  nka:int = getreqarg_nka(request)
  ltype:str = getreqarg_ltype(request)

  pipeline = [
    {'$match': {
      'frag_num': {'$exists': 1}, 'linked_papers_ngrams': {'$exists': 1}}},
    {'$project': {
      '_id': 1, 'frag_num': 1, 'linked_paper': '$linked_papers_ngrams'}},
    {'$unwind': '$linked_paper'},
    {'$group': {
      '_id': {'_id': '$linked_paper._id', 'frag_num': '$frag_num'},
      'count': {'$sum': '$linked_paper.cnt'},}},
    {'$group': {
      '_id': '$_id._id', 'count': {'$sum': '$count'},
      'frags': {'$push': {'frag_num': '$_id.frag_num', 'count': '$count',}},}},
    {'$sort': {'count': -1, '_id': 1}},
    {'$lookup': {
      'from': 'n_gramms', 'localField': '_id', 'foreignField': '_id',
      'as': 'ngramm'}},
    {'$unwind': '$ngramm'},
  ]

  if nka or ltype:
    pipeline += [get_ngramm_filter(nka, ltype, 'ngramm')]

  pipeline += [
    {'$project': {
      'title': '$ngramm.title', 'type': '$ngramm.type', 'nka': '$ngramm.nka',
      'count': '$count', 'frags': '$frags'}}]

  if topn:
    pipeline += [{'$limit': topn}]

  _logger.debug('pipeline: %s', pipeline)
  contexts = mdb.contexts
  out_dict = {}

  async for doc in contexts.aggregate(pipeline):
    title = doc['title']
    cnt = doc['count']
    frags = {n: 0 for n in range(1, 6)}
    frags.update(map(itemgetter('frag_num', 'count'), doc['frags']))
    dtype = doc['type']
    out = dict(sum=cnt, frags=frags)
    if not nka:
      out.update(nka=doc['nka'])
    if ltype:
      out_dict[title] = out
    else:
      out.update(type = dtype, title=title)
      did = doc['_id']
      out_dict[did] = out

  return json_response(out_dict)
예제 #24
0
async def _req_publ_publications_topics(request: web.Request) -> web.StreamResponse:
  """Кросс-распределение «топики контекстов цитирований» по публикациям"""
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)

  publications = mdb.publications
  pubs = {
    pdoc['_id']: pdoc['name']
    async for pdoc in publications.find(
      {'name': {'$exists': True}, 'uni_authors': 'Sergey-Sinelnikov-Murylev'})
  }

  topics = mdb.topics
  contexts = mdb.contexts

  out_pubs = []

  for pub_id, pub_desc in pubs.items():
    topN = await _get_topn_topics(topics, topn=topn, pub_id=pub_id)

    out_tops = []
    oconts = set()

    for i, (topic, cnt, conts) in enumerate(topN, 1):
      congr = defaultdict(set)

      pipeline = [
        {'$match': {'_id': {'$in': conts}, 'pub_id': pub_id}},
        {'$project': {
          'prefix': 0, 'suffix': 0, 'exact': 0, 'linked_papers_ngrams': 0,
          'bundles': 0, 'positive_negative': 0}},
        # {'$lookup': {
        #     'from': 'topics', 'localField': '_id',
        #     'foreignField': 'linked_papers.cont_id', 'as': 'cont'}},
        # {'$unwind': '$cont'},
        # {'$unwind': '$cont.linked_papers'},
        # {'$match': {'$expr': {'$eq': ["$_id", "$cont.linked_papers.cont_id"]}}},
        # {'$project': {'cont.type': False}},  # 'cont.linked_papers': False,
        # {'$sort': {'frag_num': 1}},
        {'$unwind': '$linked_papers_topics'},
      ]
      # _logger.debug('pipeline: %s', pipeline)
      async for doc in contexts.aggregate(pipeline):
        ngr = doc['linked_papers_topics']['_id']
        cid = doc['_id']
        oconts.add(cid)
        congr[ngr].add(cid)

      pubs = congr.pop(topic, ())
      crosstopics = []

      for j, (co, vals) in enumerate(
        sorted(congr.items(), key=lambda kv: (-len(kv[1]), kv[0])), 1
      ):
        crosstopics.append(dict(
          topic=co, cnt=len(vals), topics=tuple(sorted(vals))))

      out_tops.append(dict(
        topic=topic, conts=tuple(sorted(pubs)), conts_len=len(pubs),
        crosstopics=crosstopics, crosstopics_len=len(crosstopics)))

    out_pubs.append(dict(
      pub_id=pub_id,
      descr=pub_desc, topics_len=len(out_tops), topics=out_tops,
      conts_len=len(oconts), conts=tuple(sorted(oconts)),
    ))

  return json_response(out_pubs)
예제 #25
0
async def _req_frags_topics_ngramms(request: web.Request) -> web.StreamResponse:
  """Кросс-распределение «топики» - «фразы»"""
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)

  topn_crpssgramm: int = getreqarg_int(request, 'topn_crpssgramm')
  topn_gramm: int = getreqarg_int(request, 'topn_gramm')
  if not topn_gramm:
    topn_gramm = 500

  nka = getreqarg_nka(request)
  ltype = getreqarg_ltype(request)

  if topn_gramm:
    n_gramms = mdb.n_gramms

    top_ngramms = await _get_topn_ngramm(
      n_gramms, nka, ltype, topn_gramm, title_always_id=True)
    exists = frozenset(t for t, _, _ in top_ngramms)
  else:
    exists = ()

  pipeline = [
    {'$project': {
      'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
      'bundles': 0, 'linked_papers_topics': 0}},
    {'$unwind': '$linked_papers_ngrams'},
    {'$lookup': {
      'from': 'n_gramms', 'localField': 'linked_papers_ngrams._id',
      'foreignField': '_id', 'as': 'ngrm'}},
    {'$unwind': '$ngrm'},
  ]

  if nka or ltype:
    pipeline += [get_ngramm_filter(nka, ltype, 'ngrm')]

  pipeline += [
    {'$sort': {'ngrm.count_in_linked_papers': -1, 'ngrm.count_all': -1}},
  ]

  top_topics = await _get_topn_topics(mdb.topics, topn)
  contexts = mdb.contexts
  out_dict = {}
  zerro_frags = {n: 0 for n in range(1, 6)}
  for i, (topic, cnt, conts) in enumerate(top_topics, 1):
    frags = Counter(zerro_frags)
    congr = defaultdict(partial(Counter, zerro_frags))

    work_pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
    ] + pipeline

    # _logger.debug('topic: "%s", cnt: %s, pipeline: %s', topic, cnt, work_pipeline)

    async for doc in contexts.aggregate(work_pipeline):
      cont = doc['ngrm']
      ngr = cont['title']
      if exists and cont['_id'] not in exists:
        continue

      fnum = doc['frag_num']
      frags[fnum] += 1
      congr[ngr][fnum] += 1
      if topn_crpssgramm and len(congr) == topn_crpssgramm:
        break

    crossgrams = {}
    out_dict[topic] = dict(sum=cnt, frags=frags, crossgrams=crossgrams)

    for j, (co, cnts) in enumerate(
      sorted(congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])), 1
    ):
      crossgrams[co] = dict(frags=cnts, sum=sum(cnts.values()))

  return json_response(out_dict)
예제 #26
0
async def _req_frags_cocitauthors_cocitauthors(
  request: web.Request
) -> web.StreamResponse:
  """
  А
  Кросс-распределение «5 фрагментов» - «со-цитируемые авторы»
  """
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)

  pipeline = [
    {'$match': {
      'cocit_authors': {'$exists': True}, 'frag_num': {'$exists': True},}},
    {'$project': {
      'pub_id': True, 'cocit_authors': True, 'frag_num': True}},
    {'$lookup': {
      'from': 'publications', 'localField': 'pub_id', 'foreignField': '_id',
      'as': 'pub'}},
    {'$unwind': '$pub'},
    {'$match': {'pub.uni_authors': 'Sergey-Sinelnikov-Murylev'}},
    {'$project': {'pub': False}},
    {'$unwind': '$cocit_authors'},
    {'$lookup': {
      'from': 'contexts', 'localField': '_id', 'foreignField': '_id',
      'as': 'cont'}},
    {'$project': {
        'pub_id': True, 'cocit_authors': True, 'frag_num': True,
        'cont.cocit_authors': True}},
    {'$unwind': '$cont'},
    {'$unwind': '$cont.cocit_authors'},
    {'$match': {'$expr': {'$ne': ['$cocit_authors', '$cont.cocit_authors']}}},
    {'$group': {
      '_id': {
        'cocitauthor1': {
          '$cond': [{'$gte': ['$cocit_authors', '$cont.cocit_authors'], },
            '$cont.cocit_authors', '$cocit_authors']},
        'cocitauthor2': {
          '$cond': [{'$gte': ['$cocit_authors', '$cont.cocit_authors'], },
            '$cocit_authors', '$cont.cocit_authors']},
        'cont_id': '$_id'},
      'cont': {
        '$first': {
          'pub_id': '$pub_id', 'frag_num': '$frag_num', }}, }},
    {'$sort': {'_id': 1}},
    {'$group': {
      '_id': {
        'cocitauthor1': '$_id.cocitauthor1',
        'cocitauthor2': '$_id.cocitauthor2'},
      'count': {'$sum': 1},
      'conts': {
        '$push': {
          'cont_id': '$_id.cont_id', 'pub_id': '$cont.pub_id',
          'frag_num': '$cont.frag_num', }, }, }},
    {'$sort': {'count': -1, '_id': 1}},
    {'$project': {
      '_id': False,
      'cocitpair': {
        'author1': '$_id.cocitauthor1', 'author2': '$_id.cocitauthor2'},
      'count': '$count', 'conts': '$conts', }},
  ]
  if topn:
    pipeline += [{'$limit': topn}]
  contexts = mdb.contexts
  curs = contexts.aggregate(pipeline)
  out = []

  async for doc in curs:
    cocitpair = doc['cocitpair']
    conts = doc['conts']
    pub_ids = tuple(frozenset(map(itemgetter('pub_id'), conts)))
    cont_ids = tuple(map(itemgetter('cont_id'), conts))
    frags = Counter(map(itemgetter('frag_num'), conts))
    out.append(dict(
      cocitpair=tuple(cocitpair.values()),
      cont_cnt=len(cont_ids), pub_cnt=len(pub_ids),
      frags=dict(sorted(frags.items())), pub_ids=pub_ids, cont_ids=cont_ids))

  return json_response(out)
예제 #27
0
async def _req_publ_cocitauthors_cocitauthors(
  request: web.Request
) -> web.StreamResponse:
  """
  А
  Кросс-распределение «публикации» - «со-цитируемые авторы»
  """
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)
  topn_cocitauthors: int = getreqarg_int(request, 'topn_cocitauthors')

  contexts = mdb.contexts
  topN = await _get_topn_cocit_authors(contexts, topn)

  exists = ()
  if topn_cocitauthors:
    if not topn or topn >= topn_cocitauthors:
      exists = frozenset(map(itemgetter(0), topN[:topn_cocitauthors]))
    else:
      topNa = await _get_topn_cocit_authors(contexts, topn_cocitauthors)
      exists = frozenset(map(itemgetter(0), topNa))

  out_list = []
  for i, (cocitauthor, _) in enumerate(topN, 1):
    cnt = 0
    pubs = set()
    coaut = defaultdict(set)
    async for doc in contexts.find(
      {'cocit_authors': cocitauthor, 'frag_num': {'$gt': 0}},
      projection=['pub_id', 'cocit_authors']
    ).sort('frag_num'):
      # print(i, doc)
      if exists:
        coauthors = frozenset(
          c for c in doc['cocit_authors'] if c != cocitauthor and c in exists)
      else:
        coauthors = frozenset(
          c for c in doc['cocit_authors'] if c != cocitauthor)
      if not coauthors:
        continue
      cnt += 1
      pub_id = doc['pub_id']
      pubs.add(pub_id)
      for ca in coauthors:
        coaut[ca].add(pub_id)

    if not coaut:
      continue

    out_cocitauthors = []
    out_list.append(
      dict(
        title=cocitauthor, cnt_pubs=len(pubs), cnt_cross=len(coaut),
        pubs=tuple(sorted(pubs)), cocitauthors=out_cocitauthors))

    for j, (co, vals) in enumerate(
      sorted(coaut.items(), key=lambda kv: (-len(kv[1]), kv[0])), 1
    ):
      out_cocitauthors.append(
        dict(title=co, cnt_pubs=len(vals), pubs=tuple(sorted(vals))))

  return json_response(out_list)