def __init__(self, template_file):
    self._rules = []
    for line in iter_file(template_file):
        if not line:
            continue
        template = line.split('#')[0].strip()
        if not template:
            continue
        groups = re.findall(r'^(.+)\((.+)_(.+), (.+)_(.+)\)$', template)
        if groups:
            rule, token1_pos, token1_type, token2_pos, token2_type = groups[0]
            if token1_type == 'm':
                modifier_order = 1
                token_order = 2
            else:
                modifier_order = 2
                token_order = 1
            rule_format = '%s(%s, %s)' % (rule, token1_pos, token2_pos)
            self._rules.append((rule_format, modifier_order, token_order))
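# A hedged, self-contained sketch of how one template line is parsed by the loop
# above. The sample line 'ATT(n_t, a_m)  # attributive rule' is an assumption about
# the file format, inferred only from the split on '#' and the regex
# r'^(.+)\((.+)_(.+), (.+)_(.+)\)$'; the real template file may look different.
def _demo_parse_template_line():
    import re
    line = 'ATT(n_t, a_m)  # attributive rule'
    template = line.split('#')[0].strip()
    groups = re.findall(r'^(.+)\((.+)_(.+), (.+)_(.+)\)$', template)
    rule, token1_pos, token1_type, token2_pos, token2_type = groups[0]
    # token1_type == 'm' would mark the first slot as the modifier; here the second slot is.
    modifier_order, token_order = (1, 2) if token1_type == 'm' else (2, 1)
    rule_format = '%s(%s, %s)' % (rule, token1_pos, token2_pos)
    return rule_format, modifier_order, token_order  # ('ATT(n, a)', 2, 1)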
def __iter__(self):
    for line in utils.iter_file(self.__source):
        yield [tp[0]
               for tp in re.findall(r'(\S+)%s(\S+)' % WORD_POS_SEPARATOR, line)
               if tp[1] != 'PU']
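# Hedged sketch of the word/POS stripping above. WORD_POS_SEPARATOR is defined
# elsewhere in the project; '/' is assumed here purely for illustration, and the
# sample line with CTB-style tags ('PU' = punctuation) is invented.
def _demo_strip_pos_tags():
    import re
    line = '手机/NN 很/AD 好/VA 。/PU'
    pairs = re.findall(r'(\S+)%s(\S+)' % '/', line)
    return [tp[0] for tp in pairs if tp[1] != 'PU']  # ['手机', '很', '好']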
def test_label_eval(self):
    self.assertTrue(True)
    import re
    import os
    from nlp.config import RESOURCE_DIR
    from common.utils import iter_file
    total = 0
    extract = 0
    right = 0
    for line in iter_file(os.path.join(RESOURCE_DIR, 'tmp', 'labels.result.txt')):
        if not line:
            continue
        xx = re.findall(r'\[(\d) (\d) (\d)\]', line)
        if xx:
            print(xx)
            nums = xx[0]
            total += int(nums[0])
            extract += int(nums[1])
            right += int(nums[2])
    print('total: {}, extract: {}, right: {}'.format(total, extract, right))
    print('precision: {}, recall: {}'.format(1.0 * right / extract, 1.0 * right / total))
def clean_file(source_file, dest_file):
    logger.info('clean pinglun run...')
    with codecs.open(dest_file, 'w', encoding='utf-8') as f:
        for line in iter_file(source_file):
            for sent in clean_txt(line):
                f.write('%s\n' % sent)
def test_labelExtractor_batch(self):
    self.assertTrue(True)
    feature_file = os.path.join(RESOURCE_DIR, 'tmp', 'sbracelet', '_result', 'features.revised')
    opinion_file = os.path.join(RESOURCE_DIR, 'tmp', 'sbracelet', '_result', 'opinions.revised')
    label_extractor = LabelExtractor(feature_file, opinion_file, sentence_prob_threshold=-10)
    '''
    labels = label_extractor.extract_from_txt(txt)
    for label in labels:
        print(label)
    '''
    counter = Counter()
    results = []
    comment_file = os.path.join(RESOURCE_DIR, 'tmp', 'sbracelet', 'sbracelet.txt')
    for i, line in enumerate(utils.iter_file(comment_file)):
        print(i)
        if i > 100:
            break
        # Dependency parsing
        txts = clean.clean_txt2(line)
        relations = []
        for txt in txts:
            sentences = label_extractor.preprocess(txt)
            for sentence in sentences:
                sent = parser.parse2sents(sentence)[0]
                relation = ' '.join([str(r) for r in sent.relations])
                relations.append(relation)
        # Extract labels
        labels = label_extractor.extract_from_txt(line)
        for label in labels:
            fo = label.feature + label.opinion
            counter.update([fo])
        # print(line, '->', labels)
        results.append(line)
        results.append('->')
        results += relations
        results.append('->')
        for label in labels:
            results.append(str(label))
        results.append('')
    utils.write_file(os.path.join(RESOURCE_DIR, 'tmp', 'sbracelet', 'labels.result.txt'), results)
    for fo, c in counter.most_common():
        print(fo, c)
def test_general_polar(self):
    self.assertTrue(True)
    opinion_file = os.path.join(RESOURCE_DIR, 'dp', 'dp.opinions')
    for word in utils.iter_file(opinion_file):
        polar = lexicon.get_polar(word)
        if polar == 'x':
            print(word, polar)
def __init__(self, template_file):
    self._rules = []
    for line in iter_file(template_file):
        if not line:
            continue
        template = line.split('#')[0].strip()
        if not template:
            continue
        self._rules.append(template)
def test_x1(self):
    self.assertTrue(True)
    lines = []
    for i, line in enumerate(utils.iter_file(os.path.join(RESOURCE_DIR, 'mobile', 'std.txt'))):
        if i < 50000:
            lines.append(line)
    utils.write_file(os.path.join(RESOURCE_DIR, 'mobile', 'std.5w.txt'), lines)
def build_lm_train_data(raw_data_file, hanzi_data_file, pnyin_data_file):
    """
    Build the Language Model training corpus.
    :param raw_data_file:
    :param hanzi_data_file:
    :param pnyin_data_file:
    """
    SYMBOL_ENG = '<eng>'
    SYMBOL_NUM = '<num>'
    SYMBOL_ENG_NUM = '<engnum>'
    with codecs.open(hanzi_data_file, mode='w', encoding='utf-8') as hf,\
            codecs.open(pnyin_data_file, mode='w', encoding='utf-8') as pf:
        j = 0
        for i, line in enumerate(iter_file(raw_data_file)):
            # if i % 10000 == 0:
            #     print(i)
            if j > 24713125:
                break
            for sent in re.split(r'[,。?!?,]', line):
                tokens = tag_pinyin(sent)
                words = []
                pnyins = []
                for tp in tokens:
                    word = tp[0]
                    pnyin = tp[1]
                    if re.match(r'^[a-zA-Z]+$', word):
                        word = SYMBOL_ENG
                    if re.match(r'^[0-9]+$', word):
                        word = SYMBOL_NUM
                    if re.match(r'^[a-zA-Z0-9]+$', word):
                        word = SYMBOL_ENG_NUM
                    words.append(word)
                    pnyin = pnyin if pnyin else word
                    pnyins.append(pnyin)
                # if words:
                #     hf.write('{}\n'.format(' '.join(words)))
                if pnyins:
                    j += 1
                    if j % 10000 == 0:
                        print(j)
                    pf.write('{}\n'.format(' '.join(pnyins)))
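# Hedged sketch of the placeholder normalization used above: pure-English tokens
# become <eng>, pure-digit tokens <num>, and mixed alphanumeric tokens <engnum>,
# while Chinese tokens pass through unchanged. The sample tokens are invented and
# tag_pinyin itself is not re-implemented here.
def _demo_normalize_lm_tokens():
    import re
    normalized = []
    for word in ['iphone', '128', 'mate40', '很好']:
        if re.match(r'^[a-zA-Z]+$', word):
            word = '<eng>'
        elif re.match(r'^[0-9]+$', word):
            word = '<num>'
        elif re.match(r'^[a-zA-Z0-9]+$', word):
            word = '<engnum>'
        normalized.append(word)
    return normalized  # ['<eng>', '<num>', '<engnum>', '很好']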
def test_count_syntax(self):
    self.assertTrue(True)
    sentiments = load_sentiment_words(os.path.join(RESOURCE_DIR, 'mobile', '1正面评价词_a+.txt'))
    sentiments |= load_sentiment_words(os.path.join(RESOURCE_DIR, 'mobile', '1负面评价词_a-.txt'))
    features = load_feature_word(os.path.join(RESOURCE_DIR, 'mobile', 'mobile.ontology'))
    corpus_file = os.path.join(RESOURCE_DIR, 'mobile', 'std.txt')
    ff_counter = Counter()
    oo_counter = Counter()
    fo_counter = Counter()
    ff_samples = defaultdict(set)
    oo_samples = defaultdict(set)
    fo_samples = defaultdict(set)
    i = 0
    for line in utils.iter_file(corpus_file):
        i += 1
        if i % 100 == 0:
            print(i)
        if i > 200000:
            break
        for sent in parser.parse2sents(line):
            for relation in sent.relations:
                token1 = relation.token1.word
                token2 = relation.token2.word
                if token1 in features and token2 in features:
                    ff_counter.update([relation.format])
                    ff_samples[relation.format].add(str(relation))
                if token1 in sentiments and token2 in sentiments:
                    oo_counter.update([relation.format])
                    oo_samples[relation.format].add(str(relation))
                if token1 in sentiments and token2 in features:
                    fo_counter.update([relation.format])
                    fo_samples[relation.format].add(str(relation))
                if token1 in features and token2 in sentiments:
                    fo_counter.update([relation.format])
                    fo_samples[relation.format].add(str(relation))
    utils.save_obj(ff_counter, os.path.join(RESOURCE_DIR, 'mobile', 'count', 'ff.counter'))
    utils.save_obj(oo_counter, os.path.join(RESOURCE_DIR, 'mobile', 'count', 'oo.counter'))
    utils.save_obj(fo_counter, os.path.join(RESOURCE_DIR, 'mobile', 'count', 'fo.counter'))
    utils.save_obj(ff_samples, os.path.join(RESOURCE_DIR, 'mobile', 'count', 'ff.dict'))
    utils.save_obj(oo_samples, os.path.join(RESOURCE_DIR, 'mobile', 'count', 'oo.dict'))
    utils.save_obj(fo_samples, os.path.join(RESOURCE_DIR, 'mobile', 'count', 'fo.dict'))
def iter_corpus(pdict, base_dir_):
    for f in os.listdir(base_dir_):
        f = os.path.join(base_dir_, f)
        if os.path.isdir(f):
            iter_corpus(pdict, f)
        else:
            for line in iter_file(f):
                for w, p in re.findall(pattern, line):
                    p = PosNormalizer.normalize(p)
                    pdict[w][p] += 1
def iter_corpus(poses_, counts_, base_dir_):
    for f in os.listdir(base_dir_):
        f = os.path.join(base_dir_, f)
        if os.path.isdir(f):
            # Recurse into subdirectories, passing both accumulators
            iter_corpus(poses_, counts_, f)
        else:
            for line in iter_file(f):
                for w, p in re.findall(pattern, line):
                    poses_[w].add(p)
                    counts_[w] += 1
def __init__(self, lexicon_file):
    degrees = defaultdict(set)
    __current_degree = ''
    for word in utils.iter_file(lexicon_file):
        if word.startswith('['):
            word = word.replace('[', '').replace(']', '')
            __current_degree = word
        degrees[__current_degree].add(word)
    self._degrees = degrees
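# Hedged sketch of the lexicon layout implied by the loop above: a bracketed line
# such as '[extreme]' opens a new degree bucket and every following word joins that
# bucket until the next header (the header word itself is also added, as in the
# original loop). The sample lines are invented.
def _demo_degree_buckets():
    from collections import defaultdict
    degrees = defaultdict(set)
    current = ''
    for word in ['[extreme]', '非常', '特别', '[slight]', '稍微']:
        if word.startswith('['):
            word = word.replace('[', '').replace(']', '')
            current = word
        degrees[current].add(word)
    return dict(degrees)
    # {'extreme': {'extreme', '非常', '特别'}, 'slight': {'slight', '稍微'}}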
def extract_from_file(self, txt_file):
    """
    Extract labels; the input source is a text file.
    :param txt_file:
    :return: [Label, Label, ...]
    """
    labels = []
    for line in utils.iter_file(txt_file):
        labels += self.extract_from_txt(line)
    return labels
def test_compare_pos(self):
    self.assertTrue(True)
    wd = dict()
    jieba_vocab_file = 'D:\\soft\\anaconda2\\envs\\clabel\\Lib\\site-packages\\jieba\\dict.txt'
    for line in iter_file(jieba_vocab_file):
        word, freq, tag = line.split()
        wd[word] = tag
    jieba_user1_vocab_file = 'D:\\workspace\\pycharm\\clabel\\zresource\\nlp\\lexicon\\jieba\\user1.dict'
    for line in iter_file(jieba_user1_vocab_file):
        word, freq, tag = line.split()
        wd[word] = tag
    ds = degreeLexicon.items
    adjs = fixedSentimentLexicon.items
    print('--------------degree-------------------')
    for d in ds:
        if d in wd and wd[d] != 'd':
            # print(d, wd[d])
            pass
    print('--------------adj-------------------')
    for a in adjs:
        if a not in wd:
            # print('-miss- ', a, 10000, 'a')
            pass
        elif wd[a] != 'a' and wd[a][0] in ['n', 'i']:
            # print(a, 10000, 'a')
            pass
    print('--------------d + adj-------------------')
    for d, a in product(ds, adjs):
        x = d + a
        if x in wd and wd[x] != 'a':
            print(x, wd[x])
            pass
def create_standard_dataset(self):
    """
    Read the raw text, clean it, extract the standard sentences, and write them to a file.
    :return:
    """
    sentences = []
    for line in iter_file(self._raw_file):
        txt = clean.clean_txt(line)
        sents = clean.extract_standard_sentences(txt)
        sentences += [sent for sent in sents if clean.is_meaningful(sent)]
    write_file(self._clean_file, sentences)
def _build_train_data(self):
    VOCAB_SIZE = len(self._ctable)
    MAX_SENTENCE_LENGTH = self._max_sentence_length
    SAMPLE_NUMBER = 0
    for line in iter_file(self._label_file):
        if line:
            SAMPLE_NUMBER += 1
    logger.info('Vectorization...')
    X = np.zeros((SAMPLE_NUMBER, MAX_SENTENCE_LENGTH, VOCAB_SIZE), dtype=np.int8)
    y = np.zeros((SAMPLE_NUMBER, MAX_SENTENCE_LENGTH, VOCAB_SIZE), dtype=np.int8)
    # Index only non-empty lines so the row index stays within SAMPLE_NUMBER
    i = 0
    for line in iter_file(self._label_file):
        if not line:
            continue
        sentence, sequence = line.split('\t')
        X[i] = self._ctable.encode(sentence.split(), MAX_SENTENCE_LENGTH)
        y[i] = self._ctable.encode(sequence.split(), MAX_SENTENCE_LENGTH)
        i += 1
    logger.info('Shuffle...')
    indices = np.arange(len(y))
    np.random.shuffle(indices)
    X = X[indices]
    y = y[indices]
    # Explicitly set apart 10% for validation data that we never train over.
    # split_at = len(X) - len(X) // 10
    # (X_train, x_val) = X[:split_at], X[split_at:]
    # (y_train, y_val) = y[:split_at], y[split_at:]
    X_train, y_train = X, y
    return X_train, y_train
def test_normalize_revise_file(self):
    self.assertTrue(True)
    import html.parser
    html_parser = html.parser.HTMLParser()

    def tokens2str(tokens):
        return ' '.join(['%s/%s' % (token.word, token.pos) for token in tokens])

    ss = []
    sb_file = os.path.join(RESOURCE_DIR, 'tmp', 'sbracelet', 'sbracelet.txt')
    for i, line in enumerate(iter_file(sb_file)):
        print(i)
        # Unescape HTML entities, e.g. &hellip; => ……
        line = html_parser.unescape(line)
        # TODO HTMLParser fails to convert &#39; here, although it works in a standalone test; unclear why.
        line = line.replace('&#39;', '\'')
        for sentence in combParser.ssplit(line):
            tokens = combParser.pos(sentence, revise=False)
            s1 = tokens2str(tokens)
            # ss.append('jba1- ' + s1)
            tokens = combParser.pos(sentence, revise=True)
            s2 = tokens2str(tokens)
            # ss.append('jba2- ' + s2)
            if s1 != s2:
                ss.append('jba1- ' + s1)
                ss.append('jba2- ' + s2)
            # tokens = ltpParser.pos(line)
            # ss.append('ltp1- ' + tokens2str(tokens))
            #
            # PosReviser.revise(tokens)
            # ss.append('ltp2- ' + tokens2str(tokens))
        if i > 1000:
            break
    write_file(os.path.join(RESOURCE_DIR, 'tmp', 'sbracelet', 'sbracelet.pos.1.txt'), ss)
def load_feature_word(file_path):
    words = set()
    for line in utils.iter_file(file_path):
        line = line.strip()
        if not line:
            continue
        groups = re.findall(r'^(\S+) \S+ \S+ \S+ \[([^\[\]]*)\].*$', line)[0]
        words.add(groups[0])
        for word in groups[1].split():
            if word:
                words.add(word)
    return words
def _init_revise_map(pos_revise_file):
    revise_map = dict()
    for line in iter_file(pos_revise_file):
        word = line.split(' ')[0]
        # Drop POS tags that occur too rarely for this word
        pcs = {p: int(c)
               for p, c in re.findall(r'([a-zA-Z]+)=(\d+)', line)
               if int(c) > 1}
        total = sum(pcs.values())
        revise_map[word] = {p: 1.0 * c / total for p, c in pcs.items()}
    revise_map = {w: s for w, s in revise_map.items() if len(s) > 1}
    return revise_map
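# Hedged sketch of what _init_revise_map derives from one line. The input format
# 'word pos=count pos=count ...' is an assumption inferred from the regex; counts
# of 1 are dropped before the per-word POS distribution is normalized.
def _demo_revise_map_line():
    import re
    line = '苹果 n=8 v=1 nz=4'
    pcs = {p: int(c) for p, c in re.findall(r'([a-zA-Z]+)=(\d+)', line) if int(c) > 1}
    total = sum(pcs.values())
    return {p: 1.0 * c / total for p, c in pcs.items()}  # {'n': 0.667, 'nz': 0.333} (approx.)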
def create_train_dataset(self):
    """
    Label the training data: split into characters, then tag each character.
    <E> marks the last character of a sentence, <M> marks a non-final character.
    特别好,发货很快,赞。 => <M> <M> <E> <M> <M> <M> <E> <E> 。
    :return:
    """
    lines = []
    for line in iter_file(self._clean_file):
        result = Labeler.label(line)
        token = ' '.join([t for t, _ in result])
        sequence = ' '.join([seq for _, seq in result])
        lines.append('%s\t%s' % (token, sequence))
    write_file(self._label_file, lines)
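# Hedged sketch of the character labeling scheme described in the docstring above:
# every character gets <M> except the one right before a sentence boundary, which
# gets <E>. This mirrors the documented example only; Labeler.label itself is not
# re-implemented, and treating punctuation as the boundary signal is an assumption.
def _demo_sbd_labels():
    import re
    txt = '特别好,发货很快,赞。'
    tokens, labels = [], []
    for part in [p for p in re.split(r'[,,。!!??]', txt) if p]:
        chars = list(part)
        tokens += chars
        labels += ['<M>'] * (len(chars) - 1) + ['<E>']
    return '%s\t%s' % (' '.join(tokens), ' '.join(labels))
    # '特 别 好 发 货 很 快 赞\t<M> <M> <E> <M> <M> <M> <E> <E>'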
def test_sbd_file(self):
    self.assertTrue(True)
    model = SBDModel.load(keras_model_file=os.path.join(APP_RESOURCE_DIR, 'sbd.keras.model'))
    lines = []
    for line in iter_file(os.path.join(RESOURCE_DIR, 'tmp', 'comment.mobile.txt')):
        words = re.findall(r'[a-zA-Z0-9\u4e00-\u9fa5]', line)
        sent = ''.join(words)
        # sequence = model.predict_sequence(sent)
        pline = model.predict_txt(sent)
        lines.append('{} -> {}'.format(line, pline))
        print('{} -> {}'.format(line, pline))
    write_file(os.path.join(RESOURCE_DIR, 'tmp', 'sbd.result.txt'), lines)
def load_sentiment_words(file_path):
    words = set()
    for line in utils.iter_file(file_path):
        line = line.strip()
        if not line or line.startswith('----'):
            continue
        for word in line.split():
            word = word.strip()
            if not word:
                continue
            word = re.sub(r'\(\d+\)', '', word)
            words.add(word)
    return words
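# Hedged sketch of the sentiment-lexicon line format implied above: whitespace
# separated words, optionally suffixed with an occurrence count in parentheses that
# gets stripped, with '----' lines acting as section separators. The sample line is
# invented.
def _demo_sentiment_line():
    import re
    words = set()
    line = '好用(120) 流畅(35) 漂亮'
    for word in line.split():
        word = re.sub(r'\(\d+\)', '', word.strip())
        if word:
            words.add(word)
    return words  # {'好用', '流畅', '漂亮'}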
def __init__(self, vocab_file):
    self._pinyin2chars = defaultdict(set)
    self._counter = Counter()
    self._char2pinyin = dict()
    i = 0
    for line in iter_file(vocab_file):
        i += 1
        if i % 10000 == 0:
            print(i)
        if i > 1000000:
            break
        for c, p in tag_pinyin(line):
            self._pinyin2chars[p].add(c)
            self._char2pinyin[c] = p
            self._counter.update([c])
def test_sbd_eval(self):
    self.assertTrue(True)
    t = 0
    for line in iter_file(os.path.join(RESOURCE_DIR, 'tmp', 'sbd.result.txt')):
        txt1, txt2 = line.split('->')
        txt1 = txt1.strip()
        txt2 = txt2.strip()
        sents1 = parser.ssplit(txt1)
        sents2 = parser.ssplit(txt2)
        if sents1 == sents2:
            t += 1
        else:
            print(line)
    print('true:', t)
def find_by_rule(corpus_file, reg_rules, max_lines):
    counters = []
    for _ in range(len(reg_rules)):
        counters.append(Counter())
    i = 0
    for line in utils.iter_file(corpus_file):
        i += 1
        if i % 100 == 0:
            print(i)
        if i > max_lines:
            break
        txt = ' '.join(parser.segment(line))
        for counter, reg_rule in zip(counters, reg_rules):
            groups = re.findall(reg_rule, txt)
            if groups:
                word = groups[0]
                # # Skip single characters: their POS is often unstable and hard to pin down
                # if len(word) < 2:
                #     continue
                # Skip punctuation
                if word in [',', '。', '!', '!', '?', '?']:
                    continue
                # Skip pure digits
                if re.match(r'\d+', word):
                    continue
                # Skip degree adverbs and irrelevant words
                if degreeLexicon.is_degree(word) or irrelevantLexicon.is_irrelevant_word(word):
                    continue
                counter.update([word])
    return counters
def run_test():
    lm = BaseLM(os.path.join(LM_MODEL_DIR, 'hanzi.arpa'))
    from common.utils import iter_file
    from common.utils import write_file
    probs = []
    for line in iter_file(os.path.join(RESOURCE_DIR, 'tmp', 'comment.test.txt')):
        for sent in re.split(r'[,。?!?,]', line):
            words = re.findall(r'[a-zA-Z0-9\u4e00-\u9fa5]', sent)
            sent = ''.join(words)
            if sent:
                prob = lm.predict_prob(sent)
                probs.append((sent, prob))
    sort_probs = sorted(probs, key=lambda tp: tp[1])
    write_file(os.path.join(RESOURCE_DIR, 'tmp', 'result.txt'),
               ['{} {}'.format(p, s) for s, p in sort_probs])
def test_correct2(self):
    self.assertTrue(True)
    txts = []
    for i, line in enumerate(iter_file(os.path.join(RESOURCE_DIR, 'tmp', 'comment.mobile.tiny.txt'))):
        if i % 100 == 0:
            print(i)
        txt = std.extract_txt(line)
        sents = []
        for sent in parser.ssplit(line):
            # Keep only Chinese, English, and digits
            sent = std.extract_txt(sent)
            if not sent:
                continue
            '''Error correction: only applied to pure-Chinese sentences'''
            if not re.findall(r'[a-zA-Z0-9]', sent):
                csent = std.wed(sent)
                if sent != csent:
                    sent_prob = std.prob(sent)
                    csent_prob = std.prob(csent)
                    # Accept the correction only if the corrected text is more probable than the original
                    if csent_prob > sent_prob:
                        sent = csent
            sents.append(sent)
        ctxt = ''.join(sents)
        if ctxt != txt:
            txts.append('{} -> {}'.format(txt, ctxt))
    write_file(os.path.join(RESOURCE_DIR, 'tmp', 'correct.result.txt'), txts)
def build(cls, corpus_file, special_chars=[]):
    ctable = CharacterTable()
    chars = set()
    for line in iter_file(corpus_file):
        chars |= set(Tokenizer.token(line))
    chars.add(' ')
    if special_chars:
        for sc in special_chars:
            chars.add(sc)
    # TODO UNK WORD
    chars.add(CharacterTable.SYMBOL_UNK)
    ctable.chars = sorted(chars)
    ctable.char_indices = dict((c, i) for i, c in enumerate(ctable.chars))
    ctable.indices_char = dict((i, c) for i, c in enumerate(ctable.chars))
    return ctable
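# Hedged sketch of an encode method consistent with how the table is used in
# _build_train_data above (a token list becomes a one-hot matrix of shape
# (maxlen, vocab_size)). This is a hypothetical illustration, not the project's
# actual CharacterTable.encode; unknown tokens fall back to an assumed UNK symbol.
def _demo_one_hot_encode(chars, char_indices, tokens, maxlen, unk_symbol='<unk>'):
    import numpy as np
    x = np.zeros((maxlen, len(chars)), dtype=np.int8)
    for t, token in enumerate(tokens[:maxlen]):
        x[t, char_indices.get(token, char_indices.get(unk_symbol, 0))] = 1
    return x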
def build_train_data(self):
    histories = []
    next_chars = []
    # Split the corpus text into characters and tag each with its pinyin
    for line in iter_file(self._corpus_file):
        words, pinyins = self.segment_pinyin_txt(line)
        histories_, next_chars_ = self.build_history_nextchars(words)
        histories += histories_
        next_chars += next_chars_
    X = np.zeros((len(histories), self._maxlen, len(self._chars)), dtype=bool)
    y = np.zeros((len(histories), len(self._chars)), dtype=bool)
    for i, history in enumerate(histories):
        for t, char in enumerate(history):
            X[i, t, self.vocab_char2idx(char)] = 1
        y[i, self.vocab_char2idx(next_chars[i])] = 1
    return X, y