import codecs
import copy
import hashlib
import json
import logging
import os
import re
from collections import defaultdict
from datetime import datetime


def get_data(filename, key_fields):
    """Load a JSON-lines file into a dict keyed by the given fields."""
    data = {}
    if not os.path.exists(filename):
        return data
    row_num = get_row_num(filename)
    with codecs.open(filename, encoding='utf8') as f:
        for line_no, line in enumerate(f):
            if line_no % 100000 == 0:
                logging.info('finished: %s/%s', line_no, row_num)
            row = json.loads(line)
            # Build a composite key from the key fields, joined by NUL.
            key = '\0'.join([unicode(row[field]) for field in key_fields])
            data[key] = row
    return data
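

# `get_row_num` is used by every function here but is not defined in this
# excerpt; a minimal sketch, assuming it just counts lines so the progress
# logs can show a denominator:
def get_row_num(filename):
    with codecs.open(filename, encoding='utf8') as f:
        return sum(1 for _ in f)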


def proc_file(file_in, file_ot):
    """Rewrite each JSON row as {'word': ..., 'num': ...}."""
    row_num = get_row_num(file_in)
    with codecs.open(file_ot, 'w', encoding='utf8') as wf:
        with codecs.open(file_in, encoding='utf8') as f:
            for line_no, line in enumerate(f):
                if line_no % 100 == 0:
                    logging.info('finished: %s/%s', line_no, row_num)
                row = json.loads(line)
                word = row['word']
                word_num = get_word_num(word)
                new_row = {'word': word, 'num': word_num}
                wf.write(
                    json.dumps(new_row, ensure_ascii=False, sort_keys=True) +
                    '\n')


def get_actions(file_in, es_index, doc_type):
    """Build Elasticsearch bulk actions from a diff file, flushing in batches."""
    actions = []
    row_num = get_row_num(file_in)
    with codecs.open(file_in, encoding='utf8') as f:
        for line_no, line in enumerate(f):
            # Flush a full batch every 10000 lines; the final partial
            # batch is returned to the caller.
            if line_no % 10000 == 0 and actions:
                upload(actions)
                actions = []
            row = json.loads(line)
            _op_type = row.pop('_op_type')
            if _op_type == 'same':
                continue
            _id = hashlib.md5(
                (row['input'] + row['output']).encode('utf8')).hexdigest()
            action = {
                '_op_type': _op_type,
                '_index': es_index,
                '_type': doc_type,
                '_id': _id
            }
            suggest_field = doc_type
            doc = {
                suggest_field: {
                    'input': row.pop('input'),
                    'weight': row.pop('weight'),
                    'output': row.pop('output'),
                    'payload': {
                        'record_id': _id,
                        '_ut': datetime.now()
                    }
                }
            }
            # Any remaining fields ride along in the payload.
            doc[suggest_field]['payload'].update(row)
            if _op_type == 'index':
                action['_source'] = doc
            elif _op_type == 'update':
                action['doc'] = doc
            actions.append(action)
    return actions
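

# `upload` is not shown here; a minimal sketch, assuming it wraps the
# elasticsearch bulk helper (the `es_client` name and connection settings
# are assumptions, not the original implementation):
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es_client = Elasticsearch()


def upload(actions):
    # bulk() returns (number of successful actions, list of errors).
    success, errors = bulk(es_client, actions)
    logging.info('uploaded: %s ok, %s errors', success, len(errors))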


def proc_file(self, file_in, file_ot):
    """Expand each word/weight line into extended variants."""
    if not os.path.exists(file_in):
        raise Exception('file[%s] does not exist.' % file_in)
    row_num = get_row_num(file_in)
    with codecs.open(file_ot, 'w', encoding='utf8') as wf:
        with codecs.open(file_in, encoding='utf8') as f:
            for row_no, line in enumerate(f):
                if row_no % 100 == 0:
                    logging.info('%d/%d', row_no, row_num)
                items = line.strip('\r\n').split('\0')
                if len(items) != 2:
                    logging.warning('bad format: %s', line.strip('\r\n'))
                    continue
                word = items[0]
                weight = int(items[1])
                lst_new_word_weight = self.extend_word_weight(word, weight)
                for new_word, new_weight in lst_new_word_weight:
                    new_line = '\0'.join([new_word, word, str(new_weight)])
                    wf.write(new_line + '\n')
    logging.info('proc finished!')


def get_word_weight(self, file_in):
    """Accumulate per-word weights from a JSON-lines corpus."""
    self.word_weight = {}
    row_num = get_row_num(file_in)
    with codecs.open(file_in, encoding='utf8') as f:
        for line_no, line in enumerate(f):
            if line_no % 10000 == 0:
                logging.info('finished: %s/%s', line_no, row_num)
            try:
                row = json.loads(line)
            except Exception:
                logging.error(line_no)
                logging.error(line)
                raise
            category_weight = self.category_weight.get(row['category'])
            if category_weight is None:
                continue
            weight = MAX_WEIGHT - category_weight
            text = self.get_clean_text(row['value'])
            # common words
            words = self.get_words(text)
            for word in words:
                self.add_word_weight(word, weight)
            # course_name: every punctuation-delimited segment gets MAX_WEIGHT
            if row['category'] == 'course_name':
                words = [
                    word.strip() for word in re.split(PUNCTUATIONS, text)
                ]
                for word in words:
                    self.add_word_weight(word, MAX_WEIGHT)
            # paper_words: dictionary words matched by the trie
            paper_words = self.paper_trie.get_all_match(text)
            for word, _ in paper_words.items():
                self.add_word_weight(word, weight)
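

# `add_word_weight` is referenced above but not defined in this excerpt; a
# minimal sketch, assuming it keeps the highest weight seen per word in
# `self.word_weight` (an assumption inferred from the initialization above):
def add_word_weight(self, word, weight):
    if word and weight > self.word_weight.get(word, 0):
        self.word_weight[word] = weight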


def dump_pinyin_weight(file_in, file_ot, args):
    """Emit pinyin variants of each suggestion with adjusted weights."""
    row_num = get_row_num(file_in)
    with codecs.open(file_ot, 'w', encoding='utf8') as wf:
        with codecs.open(file_in, encoding='utf8') as f:
            for line_no, line in enumerate(f):
                if line_no % 10000 == 0:
                    logging.info('finished: %s/%s', line_no, row_num)
                try:
                    row = json.loads(line)
                except Exception:
                    logging.error('%s %s', line_no, line)
                    raise
                word = row['input']
                generator = PinyinGenerator(word)
                lst_pinyin_weight = []
                try:
                    # Full pinyin, first letters and initials are demoted
                    # slightly below the original weight.
                    if args.FULL_PINYIN:
                        new_weight = row['weight'] - 1
                        lst_pinyin_weight.append(
                            (new_weight,
                             [''.join(x) for x in generator.pinyins]))
                    if args.FIRST_LETTER:
                        new_weight = row['weight'] - 2
                        lst_pinyin_weight.append(
                            (new_weight,
                             [''.join(x) for x in generator.first_letters]))
                    if args.INITIAL:
                        new_weight = row['weight'] - 3
                        lst_pinyin_weight.append(
                            (new_weight,
                             [''.join(x) for x in generator.initials]))
                    # Fuzzy pinyin is demoted to half weight.
                    if args.FUZZY_PINYIN:
                        new_weight = row['weight'] / 2
                        lst_pinyin_weight.append(
                            (new_weight,
                             [''.join(x) for x in generator.fuzzy_pinyins]))
                    all_pinyins = set()
                    for weight, pinyins in lst_pinyin_weight:
                        all_pinyins |= set(pinyins)
                    # Mixed pinyin/Chinese variants explode combinatorially,
                    # so only generate them for short words.
                    if len(all_pinyins) < MAX_MIX_NUM and len(word) < 6:
                        if args.MIX_PINYIN_WITH_CHINESE:
                            new_weight = row['weight'] / 2 - 100
                            lst_pinyin_weight.append((new_weight, [
                                ''.join(x)
                                for x in generator.mix_pinyins_with_chinese
                            ]))
                        elif args.MIX_PINYIN:
                            new_weight = row['weight'] / 2 - 100
                            lst_pinyin_weight.append(
                                (new_weight,
                                 [''.join(x) for x in generator.mix_pinyins]))
                except Exception:
                    logging.error('%s %s', line_no, line)
                    raise
                # Keep the highest weight for each generated input.
                new_input_weight = defaultdict(int)
                for weight, pinyins in lst_pinyin_weight:
                    for pinyin in pinyins:
                        if weight > new_input_weight[pinyin]:
                            new_input_weight[pinyin] = weight
                for _input, weight in new_input_weight.items():
                    if MIN_WORD_LEN <= len(_input) <= MAX_WORD_LEN:
                        new_row = copy.deepcopy(row)
                        new_row['input'] = _input
                        new_row['weight'] = weight
                        wf.write(
                            json.dumps(
                                new_row, sort_keys=True,
                                ensure_ascii=False) + '\n')
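

# `PinyinGenerator` is not shown here; a minimal sketch of its two simplest
# attributes (`pinyins` and `first_letters`) built on the pypinyin package.
# Enumerating heteronym readings via a cartesian product is an assumption
# about how the original generates candidates:
from itertools import product

from pypinyin import pinyin as get_pinyin, Style


class PinyinGenerator(object):
    def __init__(self, word):
        # One list of candidate readings per character,
        # e.g. u'重心' -> [['zhong', 'chong'], ['xin']].
        readings = get_pinyin(word, style=Style.NORMAL, heteronym=True)
        self.pinyins = list(product(*readings))
        first = [[r[0] for r in rs] for rs in readings]
        self.first_letters = list(product(*first))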