def cut_query(fn, fn_out):
    """Segment queries from a JSON-lines file.

    Reads one JSON object per line from `fn`, runs its 'query' field through
    text_cutter, stores the result under 'cut_query' (dropping 'query'),
    and writes each transformed object as a UTF-8 JSON line to `fn_out`.
    """
    with open(fn_out, 'w') as fout, open(fn) as fin:
        for raw_line in fin:
            record = json.loads(raw_line, encoding='utf8')
            # text_cutter works on byte strings and returns 'cut_title'
            segmented = text_cutter.process(
                {'title': record['query'].encode('utf8')})
            record['cut_query'] = segmented['cut_title'].decode('utf8')
            del record['query']
            print >> fout, json.dumps(record, ensure_ascii=False).encode('utf8')
def get_k_nearest_title(self, title, k):
    """Return up to `k` (title, distance) pairs closest to `title`.

    The title is segmented, embedded via the model, L2-normalized, and
    looked up in the ANN index; results are sorted by ascending distance.
    Returns [] when the model produces no valid title vector.
    """
    if isinstance(title, unicode):
        title = title.encode('utf8')
    segmented = text_cutter.process({'title': title})
    cut_title = segmented['cut_title'].decode('utf8')
    vecs, valid_titles = self.model.get_title_vec([cut_title])
    if not valid_titles:
        return []
    # Normalize rows to unit length so ANN distances reflect cosine similarity.
    vecs = vecs / np.sqrt(np.sum(vecs ** 2, 1, keepdims=True))
    query_vec = list(vecs)[0]
    idxs, dists = self.title_ann.get_nns_by_vector(
        query_vec, n=k, include_distances=True)
    matched = [self.titles[i] for i in idxs]
    return sorted(zip(matched, dists), key=lambda pair: pair[-1])
def get_news_from_gid_file(fn_id, fn_out):
    """Fetch and segment article titles for the group ids listed in `fn_id`.

    `fn_id` holds one integer group id per line. For each id, the matching
    (id, title) row is read from ss_article_group, its title is segmented
    into 'cut_title', and the row is written as a UTF-8 JSON line to
    `fn_out`. Rows that are missing or whose title is NULL/empty are skipped.
    """
    with open(fn_id) as fin:
        gids = [int(line.strip()) for line in fin]
    sql = "select id, title from ss_article_group where id=%s;"
    with open(fn_out, 'w') as fout:
        for gid in gids:
            groupdb_dal.execute(sql, gid)
            row = groupdb_dal.cursor.fetchone()
            # One guard covers all three cases: no row, NULL title, empty title.
            if row is None or not row['title']:
                continue
            cut_data = text_cutter.process(
                {'title': row['title'].encode('utf8')})
            row['cut_title'] = cut_data['cut_title'].decode('utf8')
            print >> fout, json.dumps(row, ensure_ascii=False).encode('utf8')
def get_title_for_query_file(fn_combine, fn_exist_titles_list, fn_out): gids = set() for fn_title in fn_exist_titles_list: with open(fn_title) as fin: for line in fin: try: gid = int(line.split('\t')[0]) gids.add(gid) except ValueError as e: print e print line.strip() sql = "select id, title from ss_article_group where id=%s;" lno = 0 with open(fn_out, 'w') as fout: with open(fn_combine) as fin: for line in fin: if lno % 1000 == 0: sys.stdout.write('process to %d\r' % lno) sys.stdout.flush() lno += 1 data = json.loads(line, encoding='utf8') for c, r in data['title']: if c not in gids: groupdb_dal.execute(sql, c) row = groupdb_dal.cursor.fetchone() if row == None or row['title'] == None: continue if len(row['title']) == 0: continue cut_data = text_cutter.process( {'title': row['title'].encode('utf8')}) row['cut_title'] = cut_data['cut_title'].decode('utf8') row['cut_title'] = row['cut_title'].replace( '\n', '').replace('\r', '') print >> fout, ("%s\t%s" % (c, row['cut_title'])).encode('utf8') gids.add(c)
def get_title_for_query_file(fn_combine, fn_exist_titles_list, fn_out): gids = set() for fn_title in fn_exist_titles_list: with open(fn_title) as fin: for line in fin: try: data = json.loads(line, encoding='utf8') gid = data['id'] gids.add(gid) except ValueError as e: print e print line.strip() sql = "select id, title from ss_article_group where id=%s;" lno = 0 with open(fn_out, 'w') as fout: with open(fn_combine) as fin: for line in fin: if lno % 1000 == 0: sys.stdout.write('process to %d\r' % lno) sys.stdout.flush() lno += 1 data = json.loads(line, encoding='utf8') for c, r in data['title']: if c not in gids: groupdb_dal.execute(sql, c) row = groupdb_dal.cursor.fetchone() if row == None or row['title']==None: continue if len(row['title']) == 0: continue cut_data = text_cutter.process({'title': row['title'].encode('utf8')}) row['cut_title'] = cut_data['cut_title'].decode('utf8') row['cut_title'] = row['cut_title'].replace('\n', '').replace('\r', '') # print >> fout, ("%s\t%s" % (c, row['cut_title'])).encode('utf8') print >> fout, json.dumps(row, ensure_ascii=False).encode('utf8') gids.add(c)
def get_answers(self, query, k):
    """Answer `query` with the top-k ranked titles.

    The query is segmented and embedded, 10*k candidate titles are
    recalled from the ANN index by cosine similarity, and the model
    re-ranks the candidates; the best k are returned. Returns [] when
    the model produces no valid query vector.
    """
    if isinstance(query, unicode):
        query = query.encode('utf8')
    segmented = text_cutter.process({'title': query})
    cut_query = segmented['cut_title'].decode('utf8')
    vecs, valid_queries = self.model.get_query_vec([cut_query])
    if not valid_queries:
        return []
    # Normalize rows to unit length so ANN distances reflect cosine similarity.
    vecs = vecs / np.sqrt(np.sum(vecs ** 2, 1, keepdims=True))
    query_vec = list(vecs)[0]
    # Recall stage: over-fetch 10x candidates from the ANN index.
    candidate_idxs, scores = self.title_ann.get_nns_by_vector(
        query_vec, n=k * 10, include_distances=True)
    # Rank stage: let the model order the recalled titles.
    candidates = [self.titles[i] for i in candidate_idxs]
    return self.model.rank_titles(cut_query, candidates)[:k]