def process_batch_preana(batch):
    # Build one Sample per token for every sentence in the pre-analyzed batch.
    for index, paragraph in batch:
        for sentence in paragraph:
            sequence = []
            for token in sentence:
                sample = Sample()
                sequence.append(sample)
                sample.features['token'] = token.form
                sample.features['tags'] = uniq([form.tags for form in token.interpretations])
                sample.features['maca_lemmas'] = uniq([(form.lemma, form.tags) for form in token.interpretations])
                sample.features['space_before'] = ['space_before'] if token.space_before else ['no_space_before']
            Preprocess.create_features(sequence)
            if sequence:
                yield sequence
def process_batch(documents: Iterable[str], maca_config: str,
                  toki_config_path: str) -> Generator[List[Sample], None, None]:
    maca_analyzer = MacaAnalyzer(maca_config, toki_config_path)
    for document_id, document in enumerate(documents):
        results = maca_analyzer._maca(document)
        for res in results:
            result = maca_analyzer._parse(res)
            sequence = []
            for form, space_before, interpretations, start, end in result:
                sample = Sample()
                sequence.append(sample)
                sample.features['token'] = form
                sample.features['tags'] = uniq([t for l, t in interpretations])
                # Strip trailing ':letter[digit]' qualifiers from lemmas and restore spaces.
                interpretations = [(re.sub(r':[abcdijnopqsv]\d?$', '', l), t) for l, t in interpretations]
                sample.features['maca_lemmas'] = [(l.replace('_', ' '), t) for l, t in uniq(interpretations)]
                # TODO: clean up space_before handling
                sample.features['space_before'] = ['space_before'] if space_before != 'none' else ['no_space_before']
                sample.features['space_before'].append(space_before)
                sample.features['start'] = start
                sample.features['end'] = end
                sample.features['document_id'] = document_id
            Preprocess.create_features(sequence)
            if sequence:
                yield sequence
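# Several functions in this listing call a uniq() helper that is not defined here.
# A minimal sketch, assuming uniq() is an order-preserving de-duplication over
# hashable items (the actual project implementation may differ):
def uniq(items):
    seen = set()
    result = []
    for item in items:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result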
def process_batch(batch, maca_config, toki_config_path):
    batchC = []
    for index, line in batch:
        batchC.append(line)
    results = Preprocess.maca(batchC, maca_config, toki_config_path)
    for res in results:
        result = Preprocess.parse(res)
        # TODO: features
        sequence = []
        for form, space_before, interpretations in result:
            sample = Sample()
            sequence.append(sample)
            sample.features['token'] = form
            sample.features['tags'] = uniq([t for l, t in interpretations])
            sample.features['maca_lemmas'] = interpretations
            sample.features['space_before'] = ['space_before'] if space_before == 'space' else ['no_space_before']
        Preprocess.create_features(sequence)
        if sequence:
            yield sequence
def load_dataset(input_file, word_id=0, word_to_id=None, update_word_ids=True, mode='memnn'):
    # Avoid a shared mutable default for the vocabulary dict.
    if word_to_id is None:
        word_to_id = {}
    dataset_ids = []
    label_ids = []
    with open(input_file) as f:
        article = {}
        article_no = 0
        for line in f:
            line = line.strip()
            if len(line) > 0 and line[:2] == '1 ' and len(dataset_ids) > 0:
                # new article starts; reset the statement buffer
                article = {}
                article_no += 1
            if '\t' in line:
                # question line: "<id> <question>\t<answer>\t<supporting statement ids>"
                question_parts = line.split('\t')
                tokens = re.sub(r'([\.\?])$', r' \1', question_parts[0].strip()).split()
                if update_word_ids:
                    for token in tokens[1:]:
                        if token not in word_to_id:
                            word_to_id[token] = word_id
                            word_id += 1
                    if question_parts[1] not in word_to_id:
                        word_to_id[question_parts[1]] = word_id
                        word_id += 1
                stmt_ids = [int(s) for s in question_parts[2].strip().split()]
                sequence = []
                if mode == 'baseline':
                    for s in range(int(tokens[0])):
                        if s in article:
                            sequence += article[s]
                else:
                    for s in stmt_ids:
                        sequence += article[s]
                for token in tokens[1:]:
                    sequence.append(token)
                if article_no == 0:
                    print("seq: %s | label: %s" % (' '.join(sequence).ljust(70), question_parts[1]))
                # Store a list (not a lazy map object) so the ids can be re-iterated later.
                dataset_ids.append([word_to_id[t] for t in sequence])
                label_ids.append(word_to_id[question_parts[1]])
            else:
                # statement line: "<line number> <tokens...>"
                tokens = re.sub(r'([\.\?])$', r' \1', line).split()
                if update_word_ids:
                    for token in tokens[1:]:
                        if token not in word_to_id:
                            word_to_id[token] = word_id
                            word_id += 1
                line_no = int(tokens[0])
                article[line_no] = []
                for token in tokens[1:]:
                    article[line_no].append(token)
    return dataset_ids, label_ids, word_to_id, word_id
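# Hypothetical usage of the loader above on bAbI-style files; the paths and split
# names are assumptions, not part of the original code. The vocabulary built on the
# training split is passed back in so the test split reuses (and extends) the same ids:
train_ids, train_labels, vocab, next_id = load_dataset('data/qa1_train.txt')
test_ids, test_labels, vocab, next_id = load_dataset('data/qa1_test.txt',
                                                     word_id=next_id,
                                                     word_to_id=vocab)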
def get_data(data_path, word_dict=None, label_dict=None, mode=None):
    # Process the pre-trained word-vector file
    if mode == 'vec':
        words_vec = []
        with open(data_path, encoding='utf-8') as fr:
            lines = fr.readlines()
            for line in lines:
                if line != '\n':
                    word_vec = line.strip().split()[1:]
                    words_vec.append(word_vec)
        all_vec = list()
        for each in words_vec:
            each_vec = []
            for char in each:
                # float() is safer than eval() for parsing vector components
                each_vec.append(float(char))
            all_vec.append(each_vec)
        all_vec = np.asarray(all_vec)
        with open('./data/word_vec.pkl', 'wb') as fw:
            pickle.dump(all_vec, fw)
        return True
    # Build the vocabulary
    elif mode == 'vocab':
        word_list = list()
        with open(data_path, encoding='utf-8') as fr:
            lines = fr.readlines()
            for line in lines:
                if line != '\n':
                    word_list.append(line.strip())
        # Prepend the special tokens to the vocabulary
        special_word = ['pad', 'unknown']
        word_list = special_word + word_list
        word_dict = dict()
        for key, value in enumerate(word_list):
            word_dict[value] = key
        return word_dict
    # Process training and test data
    else:
        data, labels = [], []
        with open(data_path, encoding='utf-8') as fr:
            lines = fr.readlines()
        sequence, tag = [], []
        for line in lines:
            if line != '\n':
                char, label = line.strip().split()
                sequence.append(char)
                tag.append(label)
            else:
                # A blank line ends a sentence: map characters and labels to ids
                sequence_ids = [word_dict[char] if char in word_dict else word_dict['unknown'] for char in sequence]
                tag_ids = [label_dict[label] for label in tag]
                data.append(sequence_ids)
                labels.append(tag_ids)
                sequence, tag = [], []
        return data, labels
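# A sketch of how the three modes of get_data() fit together; the file paths and
# the label set below are assumptions for illustration only:
word_dict = get_data('./data/vocab.txt', mode='vocab')
get_data('./data/word_vec.txt', mode='vec')  # writes ./data/word_vec.pkl as a side effect
label_dict = {'O': 0, 'B-PER': 1, 'I-PER': 2}  # hypothetical tag set
train_data, train_labels = get_data('./data/train.txt',
                                    word_dict=word_dict,
                                    label_dict=label_dict)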
def deco2sentence(self, decoderOutputs):
    """Decode the output of the decoder and return a human friendly sentence.

    Args:
        decoderOutputs (list<np.array>): raw output scores for each decoding step

    Returns:
        list<int>: the id of the highest-scoring word at each step
    """
    sequence = []
    # Choose the words with the highest prediction score
    for out in decoderOutputs:
        sequence.append(np.argmax(out))
    return sequence
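# Minimal illustration of the greedy argmax step performed by deco2sentence();
# the two-step, four-word score arrays below are made up for the example:
import numpy as np
decoder_outputs = [np.array([0.1, 0.7, 0.1, 0.1]),   # step 1 -> id 1
                   np.array([0.2, 0.1, 0.6, 0.1])]   # step 2 -> id 2
ids = [int(np.argmax(out)) for out in decoder_outputs]
assert ids == [1, 2]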
def getStrokes(filename):
    tree = ET.parse(filename)
    root = tree.getroot()
    sequence = []
    xnarray = []
    ynarray = []
    tnarray = []
    xarray = []
    yarray = []
    tarray = []
    parray = []
    pointarray = [0]
    op = 0
    for stroke in root[1].findall('Stroke'):
        points = 0
        for point in stroke.findall('Point'):
            xarray.append(float(point.attrib['x']))
            yarray.append(float(point.attrib['y']))
            tarray.append(float(point.attrib['time']))
            parray.append(1)
            points = points + 1
        op = op + points
        pointarray.append(op)
        # Zero the flag on the first and last point of each stroke
        parray[-1] = 0
        parray[len(parray) - points] = 0
    # Keep the very first point as-is, then store offsets
    xnarray.append(xarray[0])
    ynarray.append(yarray[0])
    tnarray.append(tarray[0])
    for i, j in zip(pointarray[:], pointarray[1:]):
        if i != 0:
            # Offset of a stroke's first point from the previous stroke's last point
            xnarray.append(xarray[i] - xarray[i - 1])
            ynarray.append(yarray[i] - yarray[i - 1])
            tnarray.append(tarray[i] - tarray[i - 1])
        for point in range(i + 1, j):
            # Offsets of the remaining points from the stroke's first point
            xnarray.append(xarray[point] - xarray[i])
            ynarray.append(yarray[point] - yarray[i])
            tnarray.append(tarray[point] - tarray[i])
    # Materialize the zip so the result can be iterated more than once (zip is lazy in Python 3)
    result = list(zip(tnarray, xnarray, ynarray, parray))
    sequence.append(result)
    return sequence
def getStrokes(filename):
    tree = ET.parse(filename)
    root = tree.getroot()
    sequence = []
    xnarray = []
    ynarray = []
    tnarray = []
    xarray = []
    yarray = []
    tarray = []
    parray = []
    pointarray = [0]
    op = 0
    for stroke in root[1].findall('Stroke'):
        points = 0
        for point in stroke.findall('Point'):
            xarray.append(float(point.attrib['x']))
            yarray.append(float(point.attrib['y']))
            tarray.append(float(point.attrib['time']))
            parray.append(1)
            points = points + 1
        op = op + points
        pointarray.append(op)
        # Zero the flag on the first and last point of each stroke
        parray[-1] = 0
        parray[len(parray) - points] = 0
    xnarray.append(xarray[0])
    ynarray.append(yarray[0])
    tnarray.append(tarray[0])
    for i, j in zip(pointarray[:], pointarray[1:]):
        if i != 0:
            # Absolute offset of a stroke's first point from the previous stroke's last point
            xnarray.append(abs(xarray[i] - xarray[i - 1]))
            ynarray.append(abs(yarray[i] - yarray[i - 1]))
            tnarray.append(abs(tarray[i] - tarray[i - 1]))
        for point in range(i + 1, j):
            # Absolute offsets of the remaining points from the stroke's first point
            xnarray.append(abs(xarray[point] - xarray[i]))
            ynarray.append(abs(yarray[point] - yarray[i]))
            tnarray.append(abs(tarray[point] - tarray[i]))
    # The very first entry is an absolute coordinate; zero it out
    xnarray[0] = 0
    ynarray[0] = 0
    tnarray[0] = 0
    # Materialize the zip so the result can be iterated more than once (zip is lazy in Python 3)
    result = list(zip(tnarray, xnarray, ynarray, parray))
    sequence.append(result)
    return sequence
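# Hypothetical follow-up for the getStrokes() variants above: stacking the returned
# (time_offset, x_offset, y_offset, pen_flag) tuples into a single (N, 4) numpy array.
# The file name is an assumption for illustration only:
import numpy as np
strokes = getStrokes('a01-000u-01.xml')
points = np.array([row for stroke in strokes for row in stroke], dtype=np.float32)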
def process_batch_preana(batch: Iterable[Paragraph]) -> Generator[List[Sample], None, None]:
    for document_id, paragraph in batch:
        for sentence in paragraph:
            sequence = []
            for token in sentence:
                sample = Sample()
                sequence.append(sample)
                sample.features['token'] = token.form
                sample.features['tags'] = uniq([form.tags for form in token.interpretations])
                sample.features['maca_lemmas'] = uniq([(form.lemma, form.tags) for form in token.interpretations])
                sample.features['space_before'] = ['space_before'] if token.space_before else ['no_space_before']
                sample.features['space_before'].append(token.space_before)
                sample.features['document_id'] = document_id
            Preprocess.create_features(sequence)
            if sequence:
                yield sequence
def preprocess_paragraph_reanalyzed(paragraph: Paragraph) -> List[Tuple[List[Sample], List[Sample]]]:
    paragraph_sequence = []
    for sentence, sentence_gold in zip(paragraph, paragraph.concraft):
        # Usable for training only if the reanalyzed and gold tokenizations have the
        # same length and every reanalyzed token has a gold form.
        valid_training_data = (len(sentence_gold.tokens) == len(sentence.tokens)
                               and len([token.gold_form for token in sentence.tokens
                                        if token.gold_form is None]) == 0)
        sequence = []
        for token in sentence.tokens:
            sample = Sample()
            sequence.append(sample)
            sample.features['token'] = token.form
            sample.features['tags'] = uniq(map(lambda form: form.tags, token.interpretations))
            if valid_training_data:
                sample.features['label'] = token.gold_form.tags
                sample.features['lemma'] = token.gold_form.lemma
            sample.features['space_before'] = ['space_before'] if is_separator_before(token.space_before) else ['no_space_before']
            sample.features['tags4e3'] = create_token_features(sample.features['token'],
                                                               sample.features['tags'],
                                                               sample.features['space_before'])
        sequence_gold = []
        for token_gold in sentence_gold.tokens:
            sample = Sample()
            sequence_gold.append(sample)
            sample.features['token'] = token_gold.form
            if token_gold.gold_form is None:
                sample.features['label'] = 'ign'
            else:
                sample.features['label'] = token_gold.gold_form.tags
                sample.features['lemma'] = token_gold.gold_form.lemma
            sample.features['space_before'] = ['space_before'] if is_separator_before(token_gold.space_before) else ['no_space_before']
        paragraph_sequence.append((sequence, sequence_gold))
    return paragraph_sequence
def preprocess_paragraph_preanalyzed(paragraph: Paragraph) -> List[Tuple[List[Sample], List[Sample]]]:
    paragraph_sequence = []
    for sentence in paragraph:
        sequence = []
        for token in sentence.tokens:
            sample = Sample()
            sequence.append(sample)
            sample.features['token'] = token.form
            sample.features['tags'] = uniq(map(lambda form: form.tags, token.interpretations))
            sample.features['label'] = token.gold_form.tags
            sample.features['lemma'] = token.gold_form.lemma
            sample.features['space_before'] = ['space_before'] if is_separator_before(token.space_before) else ['no_space_before']
            sample.features['tags4e3'] = create_token_features(sample.features['token'],
                                                               sample.features['tags'],
                                                               sample.features['space_before'])
        # The same sequence serves as both the analysis and the gold sequence
        paragraph_sequence.append((sequence, sequence))
    return paragraph_sequence
def padding(sequence, seq_length):
    # Pad with "<PAD>" up to seq_length, or truncate if the sequence is longer
    while len(sequence) < seq_length:
        sequence.append("<PAD>")
    if len(sequence) > seq_length:
        sequence = sequence[:seq_length]
    return sequence
def padding(sequence):
    # Fixed-length variant: pad or truncate to exactly 10 tokens
    while len(sequence) < 10:
        sequence.append("<PAD>")
    if len(sequence) > 10:
        sequence = sequence[:10]
    return sequence
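# Quick check of the fixed-length padding helper above (10-token variant):
assert padding(['a', 'b']) == ['a', 'b'] + ['<PAD>'] * 8
assert padding([str(i) for i in range(12)]) == [str(i) for i in range(10)]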