def query_vocab_by_title_id(self, title_id):
    """Fetch vocabulary rows linked to the given title ids.

    Duplicate title ids are dropped before querying.
    Returns (vocab_rows, vocab_schema).
    """
    unique_tids = tuple(set(title_id))
    psql = PsqlQuery()
    links, link_schema = psql.query_all(
        self.query_vocab2post_by_tid_sql, (unique_tids, ))
    vid_col = link_schema['vocabulary_id']
    unique_vids = {link[vid_col] for link in links}
    vocab, vschema = psql.query_all(
        self.query_vocab_by_id, (tuple(unique_vids), ))
    return vocab, vschema
def query_vocab_by_post_id(self, post_id):
    """Fetch vocabulary rows linked to the given post ids.

    Duplicate post ids are dropped before querying.
    Returns (vocab_rows, vocab_schema).
    """
    unique_pids = tuple(set(post_id))
    psql = PsqlQuery()
    links, link_schema = psql.query_all(
        self.query_vocab2post_by_pid_sql, (unique_pids, ))
    vid_col = link_schema['vocabulary_id']
    unique_vids = {link[vid_col] for link in links}
    vocab, vschema = psql.query_all(
        self.query_vocab_by_id, (tuple(unique_vids), ))
    return vocab, vschema
def query_vocab_by_comment_id(self, comment_id):
    """Fetch vocabulary rows linked to the given comment ids.

    Duplicate comment ids are dropped before querying.
    Returns (vocab_rows, vocab_schema).
    """
    unique_cids = tuple(set(comment_id))
    psql = PsqlQuery()
    links, link_schema = psql.query_all(
        self.query_vocab2comment_by_cmtid_sql, (unique_cids, ))
    vid_col = link_schema['vocabulary_id']
    unique_vids = {link[vid_col] for link in links}
    vocab, vschema = psql.query_all(
        self.query_vocab_by_id, (tuple(unique_vids), ))
    return vocab, vschema
def guery_vocab_group_by_comment_id(self, comment_id):
    """Fetch vocabulary-group rows for the given comment ids.

    NOTE: the 'guery' spelling is a historical typo in the public name,
    kept so existing callers keep working.
    Returns (rows, schema).
    """
    rows, schema = PsqlQuery().query_all(
        self.guery_vocab_group_by_comment_id_sql, (tuple(comment_id), ))
    return rows, schema
def guery_vocab_group_by_title_id(self, title_id):
    """Fetch vocabulary-group rows for the given title ids.

    NOTE: the 'guery' spelling is a historical typo in the public name,
    kept so existing callers keep working.
    Returns (rows, schema).
    """
    rows, schema = PsqlQuery().query_all(
        self.guery_vocab_group_by_title_id_sql, (tuple(title_id), ))
    return rows, schema
def query_comment_by_post(self, post_id):
    """Fetch comment rows for the given post ids under this tokenizer tag.

    Each (post_id, tokenizer_tag) pair forms the unique lookup key.
    Returns (comment_rows, comment_schema).
    """
    tag = self.tokenizer_tag
    keys = tuple((pid, tag) for pid in post_id)
    psql = PsqlQuery()
    comment, schema = psql.query_all(self.query_comment_by_unique_sql,
                                     (keys, ))
    return comment, schema
def get_comment_obj(self, post_id):
    """Build Comment objects (with attached vocab lists) for the given posts.

    Returns [] when post_id is empty or no comments are found. Comments
    whose id is in self.excluded_comment_ids are skipped; iteration stops
    once the index exceeds self.max_query_comment_num (strict '>' bound,
    as in the original).
    """
    if not post_id:
        return []
    # Bottleneck ?
    comments, cmtschema = self.query_comment_by_post(post_id)
    cmtid = [cmt[cmtschema['id']] for cmt in comments]
    # Fix: bail out BEFORE querying — the original only checked cmtid after
    # guery_vocab_group_by_comment_id had already been called, so an empty
    # id list would hit the DB with an empty IN () tuple.
    if not cmtid:
        return []
    cmt2vocab, c2vschema = self.guery_vocab_group_by_comment_id(cmtid)
    vid = list({
        v
        for c2v in cmt2vocab
        for v in c2v[c2vschema['vocabulary_group']]
    })
    psql = PsqlQuery()
    cvocab, vschema = psql.query_all(self.query_vocab_by_id_sql,
                                     (tuple(vid), ))
    c2v_dict = {
        c2v[c2vschema['comment_id']]: c2v[c2vschema['vocabulary_group']]
        for c2v in cmt2vocab
    }
    v_dict = {v[vschema['id']]: v for v in cvocab}
    comment_objs = []
    for i, cmt in enumerate(comments):
        if cmt[cmtschema['id']] not in self.excluded_comment_ids:
            if cmt[cmtschema['id']] in c2v_dict:
                # loop variable renamed from 'vid' so it no longer shadows
                # the outer vid list
                vocabs = [
                    self._construct_vocab(v_dict[v_id], vschema)
                    for v_id in c2v_dict[cmt[cmtschema['id']]]
                ]
            else:
                vocabs = []
            comment_objs.append(
                Comment(vocabs, self.tokenizer_tag,
                        post_id=cmt[cmtschema['post_id']],
                        audience=cmt[cmtschema['audience_id']],
                        quality=cmt[cmtschema['quality']],
                        ctype=cmt[cmtschema['ctype']],
                        retrieval_count=cmt[cmtschema['retrieval_count']],
                        floor=cmt[cmtschema['floor']],
                        id_=cmt[cmtschema['id']],
                        body=''.join(cmt[cmtschema['tokenized']].split())))
        if i > self.max_query_comment_num:
            break
    return comment_objs
def get_title_obj(self, vocab_id):
    """Build Title objects for titles reachable from the given vocab ids.

    Titles in self.excluded_title_ids / posts in self.excluded_post_ids
    are skipped; iteration stops once the index reaches
    self.max_query_title_num ('>=' bound, as in the original).
    Returns [] when vocab_id or the filtered title set is empty.
    """
    if not vocab_id:
        return []
    # Bottleneck ?
    v2t, v2tschema = self.query_vocab2title(vocab_id)
    fltr_tid = [
        q[v2tschema['title_id']] for q in v2t
        if q[v2tschema['title_id']] not in self.excluded_title_ids
    ]
    # Fix: return BEFORE querying — the original only checked tid after
    # guery_vocab_group_by_title_id had already been called, so an empty
    # id list would hit the DB with an empty IN () tuple.
    if not fltr_tid:
        return []
    title2vocab, t2vschema = self.guery_vocab_group_by_title_id(fltr_tid)
    tid = list({t2v[t2vschema['title_id']] for t2v in title2vocab})
    vid = list({
        v
        for t2v in title2vocab
        for v in t2v[t2vschema['vocabulary_group']]
    })
    if not tid:
        return []
    title_generator, tschema = self.query_title_by_id(tid)
    psql = PsqlQuery()
    tvocab, vschema = psql.query_all(self.query_vocab_by_id_sql,
                                     (tuple(vid), ))
    t2v_dict = {
        t2v[t2vschema['title_id']]: t2v[t2vschema['vocabulary_group']]
        for t2v in title2vocab
    }
    v_dict = {v[vschema['id']]: v for v in tvocab}
    title_objs = []
    for i, tt in enumerate(title_generator):
        if tt[tschema['post_id']] not in self.excluded_post_ids:
            # loop variable renamed from 'vid' so it no longer shadows
            # the outer vid list
            vocabs = [
                self._construct_vocab(v_dict[v_id], vschema)
                for v_id in t2v_dict[tt[tschema['id']]]
            ]
            title_objs.append(
                Title(vocabs, self.tokenizer_tag,
                      post_id=tt[tschema['post_id']],
                      quality=tt[tschema['quality']],
                      # NOTE(review): ctype and retrieval_count both read the
                      # 'quality' column — looks like a copy-paste slip, but
                      # kept as-is; confirm against the title schema before
                      # changing.
                      ctype=tt[tschema['quality']],
                      retrieval_count=tt[tschema['quality']],
                      body=''.join(tt[tschema['tokenized']].split()),
                      id_=tt[tschema['id']]))
        if i >= self.max_query_title_num:
            break
    return title_objs
def guery_vocab_group_by_title_using_vocab(self, vocab_id, ex_title_id):
    """Fetch vocabulary-group rows for titles matched via vocab ids.

    An empty exclusion list is replaced with [-1] so the SQL exclusion
    tuple is never empty. NOTE: the 'guery' spelling is a historical typo
    in the public name, kept so existing callers keep working.
    Returns (rows, schema).
    """
    excluded = ex_title_id if ex_title_id else [-1]
    params = {'vid': tuple(vocab_id), 'tid': tuple(excluded)}
    psql = PsqlQuery()
    rows, schema = psql.query_all(
        self.guery_vocab_group_by_title_using_vocab_id_sql, params)
    return rows, schema
def query_vocab_by_words(self, wds, relative_words=None):
    """Fetch vocab rows matching the given words (plus optional relatives).

    Each lookup key is (word, pos, tokenizer_tag). A failure to append
    relative_words is logged as a warning and otherwise ignored.
    Returns (vocab_rows, vocab_schema).
    """
    words = list(wds)
    if relative_words:
        try:
            words.extend(relative_words)
        except Exception as err:
            # best-effort: relatives are optional, so just log and continue
            self.logger.warning(err)
    tag = self.tokenizer_tag
    bundle = tuple((w.word, w.pos, tag) for w in words)
    psql = PsqlQuery()
    qvocab, vschema = psql.query_all(self.query_vocab_sql, (bundle, ))
    return qvocab, vschema
def query_vocab2post(self, vocab_id):
    """Return the post ids linked to the given vocabulary ids."""
    psql = PsqlQuery()
    links, schema = psql.query_all(self.query_vocab2post_by_vid_sql,
                                   (tuple(vocab_id), ))
    pid_col = schema['post_id']
    return [link[pid_col] for link in links]
def query_comment_quality_by_id(self, comment_id):
    """Fetch quality rows for the given comment id; returns (rows, schema)."""
    rows, schema = PsqlQuery().query_all(
        self.query_comment_quality_by_id_sql, (comment_id, ))
    return rows, schema
def query_title_quality_by_id(self, title_id):
    """Fetch quality rows for the given title id; returns (rows, schema)."""
    rows, schema = PsqlQuery().query_all(
        self.query_title_quality_by_id_sql, (title_id, ))
    return rows, schema
def query_vocab_quality_by_id(self, vocab_word):
    """Fetch quality rows for the given vocab word; returns (rows, schema)."""
    rows, schema = PsqlQuery().query_all(
        self.query_vocab_quality_by_word_sql, (vocab_word, ))
    return rows, schema
def _query_all(self, sql_string, data=None):
    """Run sql_string with optional parameters; return (rows, schema)."""
    rows, schema = PsqlQuery().query_all(sql_string, data)
    return rows, schema
return [v for v in cmt.vocabs] return [extract(cmt) for cmt in comments] if __name__ == '__main__': with open('eval0829.csv', 'w') as f: f.write('random, base, pweight\n') psql = PsqlQuery() posts = psql.query(query_post_sql) pschema = psql.schema valid_post = 0 for idx, p in enumerate(posts): titles, tschema = psql.query_all( query_title_sql, dict(pid=p[pschema['id']], tok='jieba')) basic_retriever = RetrievalEvaluate( 'jieba', excluded_post_ids=[p[pschema['id']]], logger_name='retrieve') pweight_retriever = RetrievalEvaluate( 'jieba', excluded_post_ids=[p[pschema['id']]], pweight=JiebaPosWeight.weight, logger_name='retrieve') query = ' '.join([ '{}:{}'.format(w, p) for w, p in zip(titles[0][tschema['tokenized']].split(), titles[0][
def query_vocab2title(self, vocab_id):
    """Fetch vocabulary-to-title link rows for the given vocab ids.

    Returns (link_rows, link_schema). (Local renamed from the original's
    misleading 'vocab2post'.)
    """
    psql = PsqlQuery()
    links, schema = psql.query_all(self.query_vocab2title_by_vid_sql,
                                   (tuple(vocab_id), ))
    return links, schema