def get_cmp_authors_cont(
    ap1: AuthorParam,
    ap2: AuthorParam,
    word: str,
    field_col: FieldsSet,
    ngrmpr: Optional[NgrammParam],
    probability: Optional[float],
) -> list:
    """Build the comparison-with-contexts pipeline for two single authors.

    Resolves each ``AuthorParam`` to its ``(atype, name)`` pair via
    ``get_qual_auth()`` and forwards everything to
    ``get_cmp_authors_ref_cont``.

    Args:
        ap1: First author selector; must satisfy ``only_one()``.
        ap2: Second author selector; must satisfy ``only_one()``.
        word: Passed through unchanged to ``get_cmp_authors_ref_cont``.
        field_col: Field set to compare on.
        ngrmpr: Optional n-gram parameters, passed through unchanged.
        probability: Optional probability value, passed through unchanged.

    Returns:
        Whatever ``get_cmp_authors_ref_cont`` returns (annotated as a list).

    Raises:
        AssertionError: If either selector does not satisfy ``only_one()``.
    """
    # Checked one at a time; ap2 is only inspected when ap1 passes,
    # exactly like the short-circuiting ``and``.
    assert ap1.only_one()
    assert ap2.only_one()

    first_type, first_name = ap1.get_qual_auth()
    second_type, second_name = ap2.get_qual_auth()
    return get_cmp_authors_ref_cont(
        first_type, first_name,
        second_type, second_name,
        word, field_col, ngrmpr, probability,
    )
async def _req_common2authors(
    authorParams1: AuthorParam = Depends(depAuthorParamOnlyOne),
    authorParams2: AuthorParam = Depends(depAuthorParamOnlyOne2),
    ngrmpr: NgrammParam = Depends(depNgrammParamReq),
    probability: Optional[float] = .5,
    _debug_option: Optional[DebugOption] = None,
    slot: Slot = Depends(Slot.req2slot),
):
    """Report, per field set, the words shared by two authors.

    For every pipeline produced by ``get_cmp_authors`` the aggregation is run
    on ``slot.mdb.publications`` and the per-author counters returned by
    ``collect_cmp_vals`` are combined into::

        {common: <#shared keys>, union: <#all keys>,
         common_words: [{word, author1: <count>, author2: <count>}, ...]}

    Debug short-circuits:
        * ``DebugOption.pipeline`` returns the pipelines without running them.
        * ``DebugOption.raw_out`` returns the raw aggregation documents per
          field set (only reached when the two selectors differ).

    When both selectors compare equal, nothing is queried and every field set
    is reported as ``{common: 1, union: 1}``.
    """
    pipelines = get_cmp_authors(authorParams1, authorParams2, ngrmpr, probability)
    if _debug_option == DebugOption.pipeline:
        return pipelines

    atype1, name1 = authorParams1.get_qual_auth()
    atype2, name2 = authorParams2.get_qual_auth()

    if authorParams1 == authorParams2:
        # Same author on both sides: fixed placeholder (one shared dict
        # object is reused for every field set, as before).
        identical = dict(common=1, union=1)
        return dict(
            author1=dict(atype=atype1, name=name1),
            author2=dict(atype=atype2, name=name2),
            **{fld: identical for fld in FieldsSet})

    coll: Collection = slot.mdb.publications

    if _debug_option == DebugOption.raw_out:
        raw = {}
        for key, pipeline in pipelines.items():
            raw[key] = [doc async for doc in coll.aggregate(pipeline)]
        return raw

    vals = {}
    for key, pipeline in pipelines.items():
        cursor = coll.aggregate(pipeline)
        cnts1, cnts2 = await collect_cmp_vals(atype1, name1, atype2, name2, cursor)

        every_key = cnts1.keys() | cnts2.keys()
        shared_keys = cnts1.keys() & cnts2.keys()
        ordered = sorted(shared_keys)

        if key == FieldsSet.ngram:
            # Drop the leading len(ltype.value) + 1 characters from each key
            # (presumably an "<ltype>" prefix plus a separator - TODO confirm).
            skip = len(ngrmpr.ltype.value) + 1
            shown = [w[skip:] for w in ordered]
        else:
            shown = ordered

        common_words = [
            dict(word=label, author1=cnts1[w], author2=cnts2[w])
            for label, w in zip(shown, ordered)
        ]
        vals[key] = dict(
            common=len(shared_keys),
            union=len(every_key),
            common_words=common_words)

    return dict(
        author1=dict(atype=atype1, name=name1),
        author2=dict(atype=atype2, name=name2),
        **vals)
def get_cmp_authors(
    ap1: AuthorParam,
    ap2: AuthorParam,
    ngrmpr: NgrammParam,
    probability: float,
) -> Dict[str, list]:
    """Build one comparison pipeline per ``FieldsSet`` member.

    Args:
        ap1: First author selector; must satisfy ``only_one()``.
        ap2: Second author selector; must satisfy ``only_one()``.
        ngrmpr: N-gram parameters forwarded to ``get_cmp_authors_ref``.
        probability: Probability value forwarded to ``get_cmp_authors_ref``.

    Returns:
        Mapping from each ``FieldsSet`` member to the pipeline returned by
        ``get_cmp_authors_ref`` for that member.

    Raises:
        AssertionError: If either selector does not satisfy ``only_one()``.
    """
    assert ap1.only_one()
    assert ap2.only_one()

    first_type, first_name = ap1.get_qual_auth()
    second_type, second_name = ap2.get_qual_auth()

    return {
        fld_set: get_cmp_authors_ref(
            first_type, first_name, second_type, second_name,
            fld_set, ngrmpr, probability)
        for fld_set in FieldsSet
    }
async def _req_pubs_refauthors(
    top_auth: Optional[int] = 3,
    authorParams: AuthorParam = Depends(),
    _debug_option: Optional[DebugOption] = None,
    slot: Slot = Depends(Slot.req2slot),
):
    """List publications together with their referenced authors.

    Publications that have a ``name`` field and match
    ``filter_acc_dict(authorParams)`` are read in ascending ``_id`` order.
    For each one, the ``contexts`` collection is aggregated with a
    ``$match`` on ``pubid`` followed by the stages from
    ``get_refauthors_part``; the ``pos_neg`` and ``frags`` fields are removed
    from every resulting row.

    With ``DebugOption.pipeline`` the shared pipeline tail is returned
    instead and nothing is queried.

    Returns:
        A list of ``{pubid, name, ref_authors}`` dicts.
    """
    # The tail is built with an empty AuthorParam: the author filter is
    # applied to the publications query below, not inside this pipeline.
    tail = get_refauthors_part(top_auth, AuthorParam())
    if _debug_option == DebugOption.pipeline:
        return tail

    publications: Collection = slot.mdb.publications
    contexts: Collection = slot.mdb.contexts

    query = {'name': {'$exists': 1}, **filter_acc_dict(authorParams)}
    found = publications.find(
        query,
        projection={'_id': True, 'name': True},
        sort=[('_id', ASCENDING)],
    )

    result = []
    async for pub in found:
        pub_id = pub['_id']
        per_pub_pipeline = [{'$match': {'pubid': pub_id}}] + tail

        rows = []
        async for row in contexts.aggregate(per_pub_pipeline):
            for dropped in ('pos_neg', 'frags'):
                row.pop(dropped, None)
            rows.append(row)

        result.append(dict(pubid=pub_id, name=pub['name'], ref_authors=rows))
    return result
async def _req_common2authors_field(
    field: FieldsSet,
    authorParams1: AuthorParam,
    authorParams2: AuthorParam,
    word: Optional[str],
    *,
    ngrmpr: Optional[NgrammParam] = None,
    probability: Optional[float] = None,
    slot: Slot,
    _debug_option: Optional[DebugOption] = None,
):
    """Compare two authors on a single field set, including contexts.

    Runs the pipeline from ``get_cmp_authors_cont`` on
    ``slot.mdb.publications`` (with ``allowDiskUse=True``) and combines the
    per-author results of ``collect_cmp_vals_conts`` into a dict with the
    keys ``author1``, ``author2``, ``common``, ``union`` and
    ``common_words``. Each ``common_words`` entry carries, per author, the
    count and the sorted contexts recorded for that word.

    Debug short-circuits:
        * ``DebugOption.pipeline`` returns the pipeline without running it.
        * ``DebugOption.raw_out`` returns the raw aggregation documents.
    """
    pipeline = get_cmp_authors_cont(
        authorParams1, authorParams2, word, field, ngrmpr, probability)
    if _debug_option == DebugOption.pipeline:
        return pipeline

    coll: Collection = slot.mdb.publications
    cursor = coll.aggregate(pipeline, allowDiskUse=True)
    if _debug_option == DebugOption.raw_out:
        return [doc async for doc in cursor]

    atype1, name1 = authorParams1.get_qual_auth()
    atype2, name2 = authorParams2.get_qual_auth()

    (cnts1, conts1), (cnts2, conts2) = await collect_cmp_vals_conts(
        atype1, name1, atype2, name2, cursor)

    every_key = cnts1.keys() | cnts2.keys()
    shared_keys = cnts1.keys() & cnts2.keys()
    ranked = sorted((w, cnts1[w], cnts2[w]) for w in shared_keys)

    if field == FieldsSet.ngram:
        # Number of leading characters cut from each key before display
        # (presumably an "<ltype>" prefix plus a separator - TODO confirm).
        skip = len(ngrmpr.ltype.value) + 1
    else:
        skip = 0

    common_words = []
    for w, cnt1, cnt2 in ranked:
        side1 = dict(cnt=cnt1, conts=sorted(conts1.get(w, ())))
        side2 = dict(cnt=cnt2, conts=sorted(conts2.get(w, ())))
        common_words.append(dict(word=w[skip:], author1=side1, author2=side2))

    return dict(
        author1=dict(atype=atype1, name=name1),
        author2=dict(atype=atype2, name=name2),
        common=len(shared_keys),
        union=len(every_key),
        common_words=common_words)
async def _req_compare2authors(
    authorParams1: AuthorParam = Depends(depAuthorParamOnlyOne),
    authorParams2: AuthorParam = Depends(depAuthorParamOnlyOne2),
    ngrmpr: NgrammParam = Depends(depNgrammParamReq),
    probability: Optional[float] = .5,
    _debug_option: Optional[DebugOption] = None,
    slot: Slot = Depends(Slot.req2slot),
):
    """Compute comparison metrics between two authors for every field set.

    For each pipeline produced by ``get_cmp_authors`` the aggregation is run
    on ``slot.mdb.publications`` and its cursor is handed to
    ``calc_cmp_vals``; the result is stored under the field-set key next to
    the ``author1`` / ``author2`` descriptors.

    Debug short-circuits:
        * ``DebugOption.pipeline`` returns the pipelines without running them.
        * ``DebugOption.raw_out`` returns the raw aggregation documents per
          field set (only reached when the two selectors differ).

    When both selectors compare equal, nothing is queried and every field set
    is reported as
    ``{common: 1, union: 1, yaccard: 1, jensen_shannon: 0}``.
    """
    pipelines = get_cmp_authors(authorParams1, authorParams2, ngrmpr, probability)
    if _debug_option == DebugOption.pipeline:
        return pipelines

    atype1, name1 = authorParams1.get_qual_auth()
    atype2, name2 = authorParams2.get_qual_auth()

    if authorParams1 == authorParams2:
        # Same author on both sides: fixed placeholder (one shared dict
        # object is reused for every field set, as before).
        identical = dict(common=1, union=1, yaccard=1, jensen_shannon=0)
        return dict(
            author1=dict(atype=atype1, name=name1),
            author2=dict(atype=atype2, name=name2),
            **{fld: identical for fld in FieldsSet})

    coll: Collection = slot.mdb.publications

    if _debug_option == DebugOption.raw_out:
        raw = {}
        for key, pipeline in pipelines.items():
            raw[key] = [doc async for doc in coll.aggregate(pipeline)]
        return raw

    metrics = {}
    for key, pipeline in pipelines.items():
        cursor = coll.aggregate(pipeline)
        metrics[key] = await calc_cmp_vals(
            atype1, name1, atype2, name2, cursor, key)

    return dict(
        author1=dict(atype=atype1, name=name1),
        author2=dict(atype=atype2, name=name2),
        **metrics)
def filter_acc_dict(ap: AuthorParam) -> Dict[str, str]:
    """Build a filter on the author, cited and citing values.

    Returns an empty dict when ``ap.is_empty()``. Otherwise each truthy
    value among ``ap.author``, ``ap.cited`` and ``ap.citing`` is stored
    under ``uni_authors``, ``uni_cited`` and ``uni_citing`` respectively.
    """
    if ap.is_empty():
        return {}

    candidates = (
        ('authors', ap.author),
        ('cited', ap.cited),
        ('citing', ap.citing),
    )
    match = {}
    for suffix, value in candidates:
        if value:
            match[f'uni_{suffix}'] = value
    return match
def filter_by_pubs_acc(authParams: AuthorParam) -> List[dict]:
    """Return pipeline stages that filter documents by their publication.

    Returns an empty list when ``authParams.is_empty()``. Otherwise the
    stages join ``publications`` on ``pubid`` -> ``_id`` into ``pub``,
    unwind it, and match the ``filter_acc_dict`` conditions with every key
    prefixed by ``pub.``.
    """
    if authParams.is_empty():
        return []

    conditions = {}
    for key, val in filter_acc_dict(authParams).items():
        conditions[f'pub.{key}'] = val

    join_publication = {
        '$lookup': {
            'from': 'publications',
            'localField': 'pubid',
            'foreignField': '_id',
            'as': 'pub',
        },
    }
    flatten_publication = {'$unwind': '$pub'}
    keep_matching = {'$match': conditions}
    return [join_publication, flatten_publication, keep_matching]
def filter_by_topic(ap: AuthorParam, *, as_field: str = 'topic'):
    """Return a ``$match`` stage selecting documents by any given value.

    Returns an empty list when ``ap.is_empty()``. Otherwise returns a
    one-stage pipeline whose ``$or`` has one alternative per truthy value
    among ``ap.author``, ``ap.cited`` and ``ap.citing``, each matched
    against ``<as_field>.uni_author``, ``<as_field>.uni_cited`` and
    ``<as_field>.uni_citing`` respectively.

    NOTE(review): the author key here is ``uni_author`` (singular), while
    ``filter_acc_dict`` uses ``uni_authors`` - confirm this matches the
    schema of the documents under ``as_field``.
    """
    if ap.is_empty():
        return []

    candidates = (
        ('author', ap.author),
        ('cited', ap.cited),
        ('citing', ap.citing),
    )
    alternatives = []
    for fld, val in candidates:
        if val:
            alternatives.append({f'{as_field}.uni_{fld}': val})

    return [{'$match': {"$or": alternatives}}]
def depAuthorParamOnlyOne2(ap: AuthorParamOnlyOne2 = Depends()) -> AuthorParam:
    """Convert the second-author parameters into a plain ``AuthorParam``.

    Copies ``author2`` / ``cited2`` / ``citing2`` from ``ap`` into the
    ``author`` / ``cited`` / ``citing`` fields of a new ``AuthorParam``.
    """
    renamed = dict(author=ap.author2, cited=ap.cited2, citing=ap.citing2)
    return AuthorParam(**renamed)
def depAuthorParamOnlyOne(authorParams: AuthorParamOnlyOne = Depends()) -> AuthorParam:
    """Convert the first-author parameters into a plain ``AuthorParam``.

    The new ``AuthorParam`` is constructed from the keyword arguments
    given by ``authorParams.dict()``.
    """
    fields = authorParams.dict()
    return AuthorParam(**fields)