示例#1
0
async def _reg_cnt_ngramm(request: web.Request) -> web.StreamResponse:
  """Aggregate citation counters for n-grams over all publications.

  Query args (all optional): topn — limit of returned n-grams,
  nka — n-gram arity filter, ltype — lemma-type filter.
  Returns a JSON list of dicts with title (and type unless ltype is
  fixed) plus count_all / count / count_conts / conts_pubs counters.
  """
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)

  nka = getreqarg_int(request, 'nka')
  ltype = getreqarg(request, 'ltype')
  if nka or ltype:
    pipeline = [get_ngramm_filter(nka, ltype)]
  else:
    pipeline = []

  pipeline += [{'$sort': {'count_all': -1, 'title': 1, 'type': 1}}]
  if topn:
    pipeline += [{'$limit': topn}]

  out = []
  get_as_tuple = itemgetter('title', 'type', 'linked_papers')
  # Fetch fields by key instead of relying on dict value order
  # (the previous `c.values()` unpacking silently depended on the
  # documents' field insertion order).
  get_cont = itemgetter('cont_id', 'cnt')
  n_gramms = mdb.n_gramms
  async for doc in n_gramms.aggregate(pipeline):
    title, lt, conts = get_as_tuple(doc)
    res = dict(title=title) if ltype else dict(title=title, type=lt)
    cnt_all = cnt_cont = 0
    pubs = set()
    for cid, cnt in map(get_cont, conts):
      cnt_cont += 1
      cnt_all += cnt
      # cont_id is '<pub_id>@<suffix>': keep only the publication part.
      pubs.add(cid.rsplit('@', 1)[0])
    res.update(
      count_all=doc['count_all'], count=cnt_all, count_conts=cnt_cont,
      conts_pubs=len(pubs))
    out.append(res)

  return json_response(out)
示例#2
0
async def _reg_cnt_pubs_ngramm(request: web.Request) -> web.StreamResponse:
  """Aggregate citation counters for n-grams separately per publication.

  Query args (all optional): topn — per-publication limit,
  nka — n-gram arity filter, ltype — lemma-type filter.
  Returns a JSON object mapping publication id to its (top-N) n-grams,
  sorted by in-publication count descending.
  """
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)

  nka = getreqarg_int(request, 'nka')
  ltype = getreqarg(request, 'ltype')
  if nka or ltype:
    pipeline = [get_ngramm_filter(nka, ltype)]
  else:
    pipeline = []

  n_gramms = mdb.n_gramms
  publications:Collection = mdb.publications
  out = {}
  get_as_tuple = itemgetter('title', 'type', 'linked_papers')
  # Fetch fields by key instead of relying on dict value order.
  get_cont = itemgetter('cont_id', 'cnt')

  async for pobj in publications.find({'name': {'$exists': True}}):
    pub_id = pobj['_id']
    # NOTE(review): pub_id is interpolated into the regex unescaped — if
    # publication ids may contain regex metacharacters, wrap it in
    # re.escape(); confirm the id format before changing.
    pipeline_work = [
      {'$match': {'linked_papers.cont_id': {'$regex': f'^{pub_id}@'}}}
    ] + pipeline
    out_ngrs = []

    cont_starts = pub_id + '@'
    async for obj in n_gramms.aggregate(pipeline_work):
      title, lt, conts = get_as_tuple(obj)
      res = dict(title=title) if ltype else dict(title=title, type=lt)
      res.update(count_all=obj['count_all'])
      cnt_all = cnt_cont = 0
      for cid, cnt in map(get_cont, conts):
        # The $regex match above only guarantees SOME context belongs to
        # pub_id; filter each context individually here.
        if cid.startswith(cont_starts):
          cnt_cont += 1
          cnt_all += cnt
      res.update(count=cnt_all, count_conts=cnt_cont)
      out_ngrs.append(res)

    out_ngrs = sorted(out_ngrs, key=itemgetter('count'), reverse=True)
    if topn:
      out_ngrs = out_ngrs[:topn]
    out[pub_id] = out_ngrs

  return json_response(out)
示例#3
0
async def _req_top_ngramm_pubs(request: web.Request) -> web.StreamResponse:
  """Top-N phrases with their citation counts broken down by publication."""
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)

  nka = getreqarg_int(request, 'nka')
  ltype = getreqarg(request, 'ltype')
  if nka or ltype:
    pipeline = [get_ngramm_filter(nka, ltype)]
  else:
    pipeline = []

  # Total count per phrase plus the distinct (cont_id, cnt) pairs feeding it.
  pipeline += [
    {'$unwind': '$linked_papers'},
    {'$group': {
      '_id': '$title', 'count': {'$sum': '$linked_papers.cnt'}, 'conts': {
        '$addToSet': {
          'cont_id': '$linked_papers.cont_id', 'cnt': '$linked_papers.cnt'}}}},
    {'$sort': {'count': -1, '_id': 1}},
  ]
  if topn:
    pipeline += [{'$limit': topn}]

  n_gramms = mdb.n_gramms
  get_as_tuple = itemgetter('_id', 'count', 'conts')
  topN = [get_as_tuple(obj) async for obj in n_gramms.aggregate(pipeline)]

  get_pubs = itemgetter('cont_id', 'cnt')
  out = {}
  for name, cnt, conts in topN:
    # Sum counts per publication with Counter arithmetic instead of the
    # previous "repeat the key n times" trick, which was O(sum of counts).
    pubs = Counter()
    for cont in conts:
      cont_id, num = get_pubs(cont)
      if num > 0:  # mirrors range(num): non-positive counts add nothing
        # cont_id is '<pub_id>@<suffix>'; aggregate by publication id.
        pubs[cont_id.rsplit('@', 1)[0]] += num
    # NOTE(review): 'contects' looks like a typo for 'contexts', but it is
    # part of the JSON API — kept unchanged for compatibility.
    out[name] = dict(all=cnt, contects=pubs)
  return json_response(out)
示例#4
0
async def _req_publ_ngramm_ngramm(request: web.Request) -> web.StreamResponse:
  """Cross-distribution "publications" - "phrases from citation contexts"."""
  app = request.app
  mdb = app['db']

  # Number of top n-grams to cross-correlate; defaults to 10.
  topn = getreqarg_topn(request)
  if not topn:
    topn = 10

  nka:int = getreqarg_nka(request)
  ltype:str = getreqarg_ltype(request)

  n_gramms = mdb.n_gramms
  # Top-N n-grams, each unpacked below as (id, type, count, context ids).
  topN = await _get_topn_ngramm(
    n_gramms, nka, ltype, topn, title_always_id=True, show_type=True)
  exists = frozenset(t for t, *_ in topN)
  out_list = []
  contexts = mdb.contexts

  # Shared pipeline tail: drop bulky fields, join each context with its
  # n-gram documents, then keep only the lookup row that matches the
  # unwound linked_papers_ngrams entry.
  pipeline = [
    {'$project': {
      'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
      'bundles': 0, 'linked_papers_topics': 0}},
    {'$lookup': {
      'from': 'n_gramms', 'localField': 'linked_papers_ngrams._id',
      'foreignField': '_id', 'as': 'cont'}},
    {'$unwind': '$cont'},
  ]
  if nka or ltype:
    pipeline += [get_ngramm_filter(nka, ltype, 'cont')]

  pipeline += [
    {'$unwind': '$linked_papers_ngrams'},
    {'$match': {'$expr': {'$eq': ['$linked_papers_ngrams._id', '$cont._id']}}},
  ]
  for i, (ngrmm, typ_, cnt, conts) in enumerate(topN, 1):
    congr = defaultdict(set)  # n-gram id -> set of publication ids
    titles = {}
    types = {}

    # Restrict the shared pipeline to this n-gram's own contexts.
    work_pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}}
    ] + pipeline

    # _logger.debug('ngrmm: "%s", cnt: %s, pipeline: %s', ngrmm, cnt, work_pipeline)

    async for doc in contexts.aggregate(work_pipeline):
      cont = doc['cont']
      ngr_id = cont['_id']
      if ngr_id not in exists:
        # Only co-occurrences with other top-N n-grams are of interest.
        continue
      pub_id = doc['pub_id']
      congr[ngr_id].add(pub_id)
      titles[ngr_id] = cont['title']
      types[ngr_id] = cont['type']

    # Remove the n-gram itself so congr holds only co-occurring grams;
    # skip it entirely when it has no publications at all.
    pubs = congr.pop(ngrmm, None)
    if not pubs:
      continue

    crossgrams = []
    oltype = ltype if ltype else types[ngrmm]

    # crossgrams is filled AFTER this append — the dict keeps a reference
    # to the same list object.
    out_list.append(
      dict(
        title=titles[ngrmm], type=oltype, pubs=tuple(sorted(pubs)),
        crossgrams=crossgrams, cnt_pubs=len(pubs), cnt_cross=len(congr)))

    # Co-occurring n-grams ordered by shared-publication count (desc), id.
    enum_sort = enumerate(
      sorted(congr.items(), key=lambda kv: (-len(kv[1]), kv[0])), 1)
    for j, (co, vals) in enum_sort:
      crossgrams.append(
        dict(
          title=titles[co], type=types[co], cnt=len(vals),
          pubs=tuple(sorted(vals))))

  return json_response(out_list)
示例#5
0
async def _req_frags_ngramm(request: web.Request) -> web.StreamResponse:
  """Distribution of citation-context phrases over the 5 fragments."""
  mdb = request.app['db']

  # Number of phrases to return; default 10.
  topn = getreqarg_topn(request) or 10

  nka: int = getreqarg_nka(request)
  ltype: str = getreqarg_ltype(request)

  # Group per (n-gram, fragment), roll the fragments up under each n-gram,
  # then join the n-gram document itself for title/type/nka.
  pipeline = [
    {'$match': {
      'frag_num': {'$exists': 1}, 'linked_papers_ngrams': {'$exists': 1}}},
    {'$project': {
      '_id': 1, 'frag_num': 1, 'linked_paper': '$linked_papers_ngrams'}},
    {'$unwind': '$linked_paper'},
    {'$group': {
      '_id': {'_id': '$linked_paper._id', 'frag_num': '$frag_num'},
      'count': {'$sum': '$linked_paper.cnt'}}},
    {'$group': {
      '_id': '$_id._id', 'count': {'$sum': '$count'},
      'frags': {'$push': {'frag_num': '$_id.frag_num', 'count': '$count'}}}},
    {'$sort': {'count': -1, '_id': 1}},
    {'$lookup': {
      'from': 'n_gramms', 'localField': '_id', 'foreignField': '_id',
      'as': 'ngramm'}},
    {'$unwind': '$ngramm'},
  ]

  if nka or ltype:
    pipeline.append(get_ngramm_filter(nka, ltype, 'ngramm'))

  pipeline.append(
    {'$project': {
      'title': '$ngramm.title', 'type': '$ngramm.type', 'nka': '$ngramm.nka',
      'count': '$count', 'frags': '$frags'}})

  if topn:
    pipeline.append({'$limit': topn})

  _logger.debug('pipeline: %s', pipeline)

  out_dict = {}
  async for doc in mdb.contexts.aggregate(pipeline):
    # Start all five fragments at zero, then overlay the observed counts.
    frags = dict.fromkeys(range(1, 6), 0)
    frags.update((f['frag_num'], f['count']) for f in doc['frags'])
    entry = dict(sum=doc['count'], frags=frags)
    if not nka:
      entry.update(nka=doc['nka'])
    if ltype:
      # Fixed lemma type: the title alone identifies the phrase.
      out_dict[doc['title']] = entry
    else:
      entry.update(type=doc['type'], title=doc['title'])
      out_dict[doc['_id']] = entry

  return json_response(out_dict)
示例#6
0
async def _req_frags_cocitauthors_ngramms(request: web.Request) -> web.StreamResponse:
  """B. Cross-distribution "co-citations" - "phrases from citation contexts"."""
  app = request.app
  mdb = app['db']

  # topn: how many co-citation authors to include (unlimited when absent).
  topn:int = getreqarg_topn(request)
  # if not topn:
  #   topn = 10
  # topn_gramm: size of the n-gram whitelist; defaults to 500.
  topn_gramm:int = getreqarg_int(request, 'topn_gramm')
  if not topn_gramm:
    topn_gramm = 500

  nka:int = getreqarg_nka(request)
  ltype:str = getreqarg_ltype(request)

  if topn_gramm:
    n_gramms = mdb.n_gramms
    top_ngramms = await _get_topn_ngramm(
      n_gramms, nka, ltype, topn_gramm, title_always_id=True)
    # Whitelist of n-gram ids; only these are counted in the loop below.
    exists = frozenset(t for t, _, _ in top_ngramms)
  else:
    exists = ()

  contexts = mdb.contexts
  topN = await _get_topn_cocit_authors(contexts, topn, include_conts=True)

  # Shared pipeline tail: drop bulky fields, join context n-grams, and keep
  # only the lookup row matching the unwound linked_papers_ngrams entry.
  pipeline = [
    {'$project': {
      'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
      'bundles': 0, 'linked_papers_topics': 0}},
    {'$lookup': {
      'from': 'n_gramms', 'localField': 'linked_papers_ngrams._id',
      'foreignField': '_id', 'as': 'cont'}},
    {'$unwind': '$cont'},
  ]

  if nka or ltype:
    pipeline += [get_ngramm_filter(nka, ltype, 'cont')]

  pipeline += [
    {'$unwind': '$linked_papers_ngrams'},
    {'$match': {'$expr': {'$eq': ['$cont._id', '$linked_papers_ngrams._id']}}},
  ]

  out_dict = {}
  for i, (cocitauthor, cnt, conts) in enumerate(topN, 1):
    frags = Counter()  # citations per fragment number for this author
    congr = defaultdict(Counter)  # n-gram id -> per-fragment counts
    titles = {}
    types = {}

    # Restrict the shared pipeline to this author's contexts.
    work_pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
    ] +  pipeline

    # _logger.debug('cocitauthor: "%s", cnt: %s, pipeline: %s', cocitauthor, cnt, work_pipeline)
    async for doc in contexts.aggregate(work_pipeline):
      cont = doc['cont']
      ngr_title = cont['title']
      ngr_id = cont['_id']
      if topn_gramm and ngr_id not in exists:
        continue

      fnum = doc['frag_num']
      frags[fnum] += 1
      ngr_cnt = doc['linked_papers_ngrams']['cnt']
      congr[ngr_id][fnum] += ngr_cnt
      titles[ngr_id] = ngr_title
      types[ngr_id] = cont['type']

    # crossgrams is filled AFTER insertion — the dict keeps a reference
    # to the same list object.
    crossgrams = []
    out_dict[cocitauthor] = dict(sum=cnt, frags=frags, crossgrams=crossgrams)

    # Co-occurring n-grams sorted by total count (desc), then by id.
    for j, (co, cnts) in enumerate(
      sorted(congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])), 1
    ):
      ngr = dict(
        title=titles[co], type=types[co], frags=cnts, sum=sum(cnts.values()))
      crossgrams.append(ngr)

  return json_response(out_dict)
示例#7
0
async def _req_frags_topics_ngramms(request: web.Request) -> web.StreamResponse:
  """Cross-distribution "topics" - "phrases"."""
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)

  # NOTE(review): 'topn_crpssgramm' looks like a typo for 'topn_crossgramm',
  # but it is the public query-arg name — renaming it would break callers.
  topn_crpssgramm: int = getreqarg_int(request, 'topn_crpssgramm')
  # topn_gramm: size of the n-gram whitelist; defaults to 500.
  topn_gramm: int = getreqarg_int(request, 'topn_gramm')
  if not topn_gramm:
    topn_gramm = 500

  nka = getreqarg_nka(request)
  ltype = getreqarg_ltype(request)

  if topn_gramm:
    n_gramms = mdb.n_gramms

    top_ngramms = await _get_topn_ngramm(
      n_gramms, nka, ltype, topn_gramm, title_always_id=True)
    # Whitelist of n-gram ids; only these are counted in the loop below.
    exists = frozenset(t for t, _, _ in top_ngramms)
  else:
    exists = ()

  # Shared pipeline tail: drop bulky fields, unwind the per-context n-gram
  # links and join the matching n-gram documents.
  pipeline = [
    {'$project': {
      'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
      'bundles': 0, 'linked_papers_topics': 0}},
    {'$unwind': '$linked_papers_ngrams'},
    {'$lookup': {
      'from': 'n_gramms', 'localField': 'linked_papers_ngrams._id',
      'foreignField': '_id', 'as': 'ngrm'}},
    {'$unwind': '$ngrm'},
  ]

  if nka or ltype:
    pipeline += [get_ngramm_filter(nka, ltype, 'ngrm')]

  pipeline += [
    {'$sort': {'ngrm.count_in_linked_papers': -1, 'ngrm.count_all': -1}},
  ]

  top_topics = await _get_topn_topics(mdb.topics, topn)
  contexts = mdb.contexts
  out_dict = {}
  # Template of zero counts for fragments 1..5.
  zerro_frags = {n: 0 for n in range(1, 6)}
  for i, (topic, cnt, conts) in enumerate(top_topics, 1):
    frags = Counter(zerro_frags)
    # Each new n-gram key starts with all five fragments at zero.
    congr = defaultdict(partial(Counter, zerro_frags))

    # Restrict the shared pipeline to this topic's contexts.
    work_pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
    ] + pipeline

    # _logger.debug('topic: "%s", cnt: %s, pipeline: %s', topic, cnt, work_pipeline)

    async for doc in contexts.aggregate(work_pipeline):
      cont = doc['ngrm']
      ngr = cont['title']
      if exists and cont['_id'] not in exists:
        continue

      fnum = doc['frag_num']
      frags[fnum] += 1
      congr[ngr][fnum] += 1
      # Stop early once the requested number of cross-grams is collected;
      # relies on the $sort above putting the most relevant grams first.
      if topn_crpssgramm and len(congr) == topn_crpssgramm:
        break

    crossgrams = {}
    # crossgrams is filled after insertion — the dict keeps a reference
    # to the same mapping object.
    out_dict[topic] = dict(sum=cnt, frags=frags, crossgrams=crossgrams)

    # Cross-grams sorted by total count (desc), then by title.
    for j, (co, cnts) in enumerate(
      sorted(congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])), 1
    ):
      crossgrams[co] = dict(frags=cnts, sum=sum(cnts.values()))

  return json_response(out_dict)