Example #1
def get_data(filename, key_fields):
    # Load line-delimited JSON into a dict keyed by the NUL-joined key_fields values.
    data = {}
    if not os.path.exists(filename):
        return data
    row_num = get_row_num(filename)
    with codecs.open(filename, encoding='utf8') as f:
        for line_no, line in enumerate(f):
            if line_no % 100000 == 0:
                logging.info('finished: %s/%s', line_no, row_num)
            row = json.loads(line)
            key = '\0'.join([unicode(row[field]) for field in key_fields])
            data[key] = row
    return data
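
None of these snippets include their imports (os, codecs, json, logging, hashlib, datetime, re, copy, collections), and they all rely on a get_row_num helper that is not shown. A minimal sketch of it, assuming it simply counts the lines of a file for progress logging:

import codecs

def get_row_num(filename):
    # Assumed behavior: total line count, used only for "finished: x/y" logs.
    with codecs.open(filename, encoding='utf8') as f:
        return sum(1 for _ in f)
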
Example #2
def proc_file(file_in, file_ot):
    # Read line-delimited JSON, compute a number for each word with
    # get_word_num, and write one JSON row per input line to file_ot.
    row_num = get_row_num(file_in)
    wf = codecs.open(file_ot, 'w', encoding='utf8')
    with codecs.open(file_in, encoding='utf8') as f:
        for line_no, line in enumerate(f):
            if line_no % 100 == 0:
                logging.info('finished: %s/%s', line_no, row_num)
            row = json.loads(line)
            word = row['word']
            word_num = get_word_num(word)
            new_row = {'word': word, 'num': word_num}
            wf.write(
                json.dumps(new_row, ensure_ascii=False, sort_keys=True) + '\n')
    wf.close()
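
A hypothetical round trip for proc_file: each input line is a JSON object with a 'word' field, and each output line pairs the word with whatever get_word_num (not shown) computes. File names here are placeholders:

import codecs
import json
import logging

logging.basicConfig(level=logging.INFO)

# Write one sample input row.
with codecs.open('words.jsonl', 'w', encoding='utf8') as f:
    f.write(json.dumps({'word': u'example'}, ensure_ascii=False) + '\n')

proc_file('words.jsonl', 'word_nums.jsonl')
# word_nums.jsonl now contains lines like: {"num": ..., "word": "example"}
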
Example #3
def get_actions(file_in, es_index, doc_type):
    # Build Elasticsearch bulk actions from line-delimited JSON, flushing
    # every 10000 lines via upload() and returning the leftover actions.
    actions = []
    row_num = get_row_num(file_in)
    with codecs.open(file_in, encoding='utf8') as f:
        for line_no, line in enumerate(f):
            if line_no % 10000 == 0 and actions:
                upload(actions)
                actions = []
            row = json.loads(line)

            _op_type = row.pop('_op_type')
            if _op_type == 'same':
                continue

            _id = hashlib.md5(
                (row['input'] + row['output']).encode('utf8')).hexdigest()

            action = {
                '_op_type': _op_type,
                '_index': es_index,
                '_type': doc_type,
                '_id': _id
            }

            suggest_field = doc_type

            doc = {
                suggest_field: {
                    'input': row.pop('input'),
                    'weight': row.pop('weight'),
                    'output': row.pop('output'),
                    'payload': {
                        'record_id': _id,
                        '_ut': datetime.now()
                    }
                }
            }
            doc[suggest_field]['payload'].update(row)

            if _op_type == 'index':
                action['_source'] = doc
            elif _op_type == 'update':
                action['doc'] = doc

            actions.append(action)

    return actions
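
upload is not shown; the action dicts carrying _op_type, _index, _type and _id suggest the elasticsearch-py bulk helper. A minimal sketch under that assumption, with the host as a placeholder:

from elasticsearch import Elasticsearch
from elasticsearch import helpers

es = Elasticsearch(['http://localhost:9200'])  # placeholder host

def upload(actions):
    # One bulk request; helpers.bulk raises on per-item errors by default.
    helpers.bulk(es, actions)
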
Example #4
    def proc_file(self, file_in, file_ot):
        # Expand each "word\0weight" input line via extend_word_weight and
        # write one "new_word\0word\0new_weight" line per expansion.
        if not os.path.exists(file_in):
            raise Exception('file[%s] does not exist.' % file_in)

        row_num = get_row_num(file_in)
        with codecs.open(file_ot, 'w', encoding='utf8') as wf:
            with codecs.open(file_in, encoding='utf8') as f:
                for row_no, line in enumerate(f):
                    if row_no % 100 == 0:
                        logging.info('%d/%d', row_no, row_num)
                    items = line.strip('\r\n').split('\0')
                    if len(items) != 2:
                        logging.warning('malformed line: %s', line.strip('\r\n'))
                        continue
                    word = items[0]
                    weight = int(items[1])
                    lst_new_word_weight = self.extend_word_weight(word, weight)
                    for new_word, new_weight in lst_new_word_weight:
                        new_line = '\0'.join([new_word, word, str(new_weight)])
                        wf.write(new_line + '\n')

        logging.info('proc finished!')
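
proc_file expects two NUL-separated fields per line (a word, then an integer weight); extend_word_weight is not shown. A hypothetical input file with placeholder values:

import codecs

with codecs.open('word_weight.txt', 'w', encoding='utf8') as f:
    f.write(u'\0'.join([u'hello', u'42']) + '\n')
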
Example #5
    def get_word_weight(self, file_in):
        # Accumulate a weight for every word extracted from the input rows,
        # with extra handling for course_name rows and paper-trie matches.
        self.word_weight = {}

        row_num = get_row_num(file_in)
        with codecs.open(file_in, encoding='utf8') as f:
            for line_no, line in enumerate(f):
                if line_no % 10000 == 0:
                    logging.info('finished: %s, %s', line_no, row_num)
                try:
                    row = json.loads(line)
                except Exception:
                    logging.error('invalid JSON at line %s: %s', line_no, line)
                    raise
                category_weight = self.category_weight.get(row['category'])
                if category_weight is None:
                    continue
                weight = MAX_WEIGHT - category_weight

                text = self.get_clean_text(row['value'])

                # common
                words = self.get_words(text)
                for word in words:
                    self.add_word_weight(word, weight)

                # course_name
                if row['category'] == 'course_name':
                    words = [
                        word.strip() for word in re.split(PUNCTUATIONS, text)
                    ]
                    for word in words:
                        self.add_word_weight(word, MAX_WEIGHT)

                # paper_words
                paper_words = self.paper_trie.get_all_match(text)
                for word, _ in paper_words.items():
                    self.add_word_weight(word, weight)
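
add_word_weight is not shown; a sketch assuming it keeps the highest weight seen for each non-empty word in self.word_weight:

    def add_word_weight(self, word, weight):
        # Hypothetical: ignore empty strings and keep the maximum weight per word.
        word = word.strip()
        if not word:
            return
        if weight > self.word_weight.get(word, 0):
            self.word_weight[word] = weight
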
Example #6
def dump_pinyin_weight(file_in, file_ot, args):
    # For each input word, generate pinyin variants (full pinyin, first letters,
    # initials, fuzzy and mixed forms, depending on args) and write one JSON
    # row per variant with an adjusted weight.
    wf = codecs.open(file_ot, 'w', encoding='utf8')
    row_num = get_row_num(file_in)
    with codecs.open(file_in, encoding='utf8') as f:
        for line_no, line in enumerate(f):
            if line_no % 10000 == 0:
                logging.info('finished: %s/%s', line_no, row_num)
            try:
                row = json.loads(line)
            except Exception:
                logging.error('invalid JSON at line %s: %s', line_no, line)
                raise
            word = row['input']
            generator = PinyinGenerator(word)

            lst_pinyin_weight = []
            try:
                if args.FULL_PINYIN:
                    new_weight = row['weight'] - 1
                    lst_pinyin_weight.append(
                        (new_weight, [''.join(x) for x in generator.pinyins]))
                if args.FIRST_LETTER:
                    new_weight = row['weight'] - 2
                    lst_pinyin_weight.append(
                        (new_weight,
                         [''.join(x) for x in generator.first_letters]))
                if args.INITIAL:
                    new_weight = row['weight'] - 3
                    lst_pinyin_weight.append(
                        (new_weight, [''.join(x) for x in generator.initials]))
                if args.FUZZY_PINYIN:
                    new_weight = row['weight'] / 2
                    lst_pinyin_weight.append(
                        (new_weight,
                         [''.join(x) for x in generator.fuzzy_pinyins]))
                all_pinyins = set()
                for weight, pinyins in lst_pinyin_weight:
                    all_pinyins |= set(pinyins)

                if len(all_pinyins) < MAX_MIX_NUM and len(word) < 6:
                    if args.MIX_PINYIN_WITH_CHINESE:
                        new_weight = row['weight'] / 2 - 100
                        lst_pinyin_weight.append((new_weight, [
                            ''.join(x)
                            for x in generator.mix_pinyins_with_chinese
                        ]))
                    elif args.MIX_PINYIN:
                        new_weight = row['weight'] / 2 - 100
                        lst_pinyin_weight.append(
                            (new_weight,
                             [''.join(x) for x in generator.mix_pinyins]))
            except Exception:
                logging.error('pinyin generation failed at line %s: %s',
                              line_no, line)
                raise

            new_input_weight = defaultdict(int)
            for weight, pinyins in lst_pinyin_weight:
                for pinyin in pinyins:
                    if weight > new_input_weight[pinyin]:
                        new_input_weight[pinyin] = weight

            for _input, weight in new_input_weight.items():
                if len(_input) <= MAX_WORD_LEN and len(_input) >= MIN_WORD_LEN:
                    new_row = copy.deepcopy(row)
                    new_row['input'] = _input
                    new_row['weight'] = weight
                    wf.write(
                        json.dumps(new_row, sort_keys=True, ensure_ascii=False)
                        + '\n')
    wf.close()
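
The args flags (FULL_PINYIN, FIRST_LETTER, INITIAL, FUZZY_PINYIN, MIX_PINYIN, MIX_PINYIN_WITH_CHINESE) are read but never defined here. A hypothetical argparse setup that would produce them, with flag spellings and file names as assumptions:

import argparse

parser = argparse.ArgumentParser()
for flag in ('FULL_PINYIN', 'FIRST_LETTER', 'INITIAL', 'FUZZY_PINYIN',
             'MIX_PINYIN', 'MIX_PINYIN_WITH_CHINESE'):
    # Flag names mirror the attributes used above; real spellings are not shown.
    parser.add_argument('--' + flag, dest=flag, action='store_true')
args = parser.parse_args()

dump_pinyin_weight('input.jsonl', 'pinyin_weight.jsonl', args)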