import re
import unicodedata

import h5py
import nltk
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from nltk.tag import StanfordPOSTagger
from nltk.tokenize import sent_tokenize
from nltk.tokenize.util import regexp_span_tokenize

import read


def generate_unicode_categories(outputfilename):
    """Encode every document as a sequence of Unicode general-category ids."""
    raw_text_dir = read.read_from_json('raw_data_dir')
    unicatedict = read.read_from_json("unicatedict")
    data_size = len(raw_text_dir)
    f = h5py.File("data/" + outputfilename + ".hdf5", "w")
    max_len_text = read.get_char2id_dict(raw_text_dir)
    dset = f.create_dataset("input", (data_size, max_len_text), dtype='int8')
    text_unicate_dict = dict()
    for data_id in range(data_size):
        raw_text = read.read_from_dir(raw_text_dir[data_id])
        # unicodedata.category works directly on str in Python 3; the original
        # per-character .decode("utf-8") was Python 2 only.
        text_inputs = [[unicatedict[unicodedata.category(char)]
                        for char in raw_text]]
        text_unicate_dict[raw_text_dir[data_id]] = text_inputs[0]
        data_x = pad_sequences(text_inputs, dtype='int8',
                               maxlen=max_len_text, padding="post")
        dset[data_id, :] = data_x[0]
    read.save_in_json("text_unicate_dict", text_unicate_dict)
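
# A minimal, standalone sketch of how the `unicatedict` mapping loaded above
# could be produced (an assumption; the repo builds it elsewhere). It maps
# Unicode general-category codes such as 'Lu' or 'Nd' to small integer ids,
# reserving 0 for the padding value written by pad_sequences.
def build_unicatedict_sketch(texts):
    """Hypothetical helper: derive {category_code: id} from a list of texts."""
    categories = sorted({unicodedata.category(ch) for text in texts for ch in text})
    return {cat: i + 1 for i, cat in enumerate(categories)}  # 0 reserved for padding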
def split_by_sentence(start=0, end=63):
    """
    Split each document into sentences (needed to build the end-to-end system).
    :param start: index of the first document to process
    :param end: index one past the last document to process
    :return: None; writes one sentence-span JSON file per document
    """
    raw_text_dir = read.read_from_json('raw_data_dir')      # stored in data/
    raw_dir_simple = read.read_from_json('raw_dir_simple')  # stored in data/
    for data_id in range(start, end):
        raw_text = read.read_from_dir(raw_text_dir[data_id])
        sent_tokenize_list = sent_tokenize(raw_text)
        # `spans` (defined elsewhere in the repo) aligns each sentence with its
        # character offset in the raw text; see the sketch after this function.
        sent_tokenize_span_list = spans(sent_tokenize_list, raw_text)
        sent_span_list = list()
        for sent_tokenize_span in sent_tokenize_span_list:
            # Further split each sentence on newlines, keeping absolute offsets.
            sent_spans = list(regexp_span_tokenize(sent_tokenize_span[0], r'\n'))
            for sent_span in sent_spans:
                sent_span = (sent_span[0] + sent_tokenize_span[1],
                             sent_span[1] + sent_tokenize_span[1])
                sent_span_list.append((raw_text[sent_span[0]:sent_span[1]],
                                       sent_span[0], sent_span[1]))
        read.save_in_json("training_sentence/sentences/" + raw_dir_simple[data_id],
                          sent_span_list)
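
# `spans` is not defined in this file; the sketch below shows the behavior the
# loop above assumes: each element is a (sentence_text, start_offset) pair
# giving the sentence's absolute character offset in raw_text.
def spans_sketch(sentences, raw_text):
    """Hypothetical reimplementation of `spans`, for reference only."""
    offset = 0
    for sent in sentences:
        start = raw_text.index(sent, offset)  # scan forward past prior sentences
        offset = start + len(sent)
        yield sent, start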
def generate_vocab_match(outputfilename):
    """Mark every character covered by a vocabulary term with the term's class id."""
    vocab_dict = get_vocab_dict()
    n_vocab = max(map(int, vocab_dict.keys())) - 1  # class ids start at 2
    raw_text_dir = read.read_from_json('raw_data_dir')
    raw_dir_simple = read.read_from_json('raw_dir_simple')
    data_size = len(raw_text_dir)
    text_length = read.read_from_json('texts_length')
    f = h5py.File("data/" + outputfilename + ".hdf5", "w")
    max_len_text = read.get_char2id_dict(raw_text_dir)
    dset = f.create_dataset("input", (data_size, max_len_text), dtype='int8')
    text_vocab_dict = dict()
    for data_id in range(data_size):
        raw_text = read.read_from_dir(raw_text_dir[data_id])
        a = np.ones(text_length[data_id])  # 1 = "no vocabulary match"
        for index in range(n_vocab):
            vocab = vocab_dict[str(index + 2)]
            time_terms = re.compile('|'.join(vocab), re.IGNORECASE)
            for m in time_terms.finditer(raw_text):
                a[m.span()[0]:m.span()[1]] = index + 2
        text_vocab_dict[raw_dir_simple[data_id]] = a.tolist()
        data_x = pad_sequences([a.tolist()], dtype='int8',
                               maxlen=max_len_text, padding="post")
        dset[data_id, :] = data_x[0]
    read.save_in_json("text_vocab_dict", text_vocab_dict)
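
# Standalone illustration (not part of the pipeline) of the character-level
# stamping done above: every character covered by a regex match receives the
# class id of the matching vocabulary group.
def _demo_vocab_match():
    text = "Admission Date: January 3"
    a = np.ones(len(text), dtype='int8')               # 1 = no match, as above
    month_terms = re.compile('january|february', re.IGNORECASE)
    for m in month_terms.finditer(text):
        a[m.start():m.end()] = 2                       # hypothetical class id 2
    return a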
def get_vocab_dict():
    """Read data/vocab/vocab2.txt into {class_id: [terms]} (ids kept as strings)."""
    data = read.read_from_dir("data/vocab/vocab2.txt")
    vocab_dict = dict()
    for line in data.splitlines():
        items = line.split()
        # dict.has_key() is Python 2 only; use the `in` operator instead.
        if items[1] in vocab_dict:
            vocab_dict[items[1]].append(items[0])
        else:
            vocab_dict[items[1]] = [items[0]]
    return vocab_dict
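
# Illustrative parse of the file format get_vocab_dict appears to assume
# (inferred from the code above, not documented here): one "<term> <class_id>"
# pair per line, inverted into {class_id: [terms]}.
def _demo_vocab_file_format():
    sample = "january 2\nfebruary 2\nmonday 3"
    vocab = dict()
    for line in sample.splitlines():
        term, group = line.split()
        vocab.setdefault(group, []).append(term)
    return vocab  # {'2': ['january', 'february'], '3': ['monday']}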
def generate_pos(start=0, end=63):
    english_postagger = StanfordPOSTagger(
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/models/english-left3words-distsim.tagger',
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/stanford-postagger.jar')
    english_postagger.java_options = '-mx4096m'
    raw_text_dir = read.read_from_json('raw_data_dir')
    pos = list()
    for data_id in range(start, end):
        raw_text = read.read_from_dir(raw_text_dir[data_id])
        print(raw_text_dir[data_id])
        contents = list()
        for line in raw_text.splitlines():
            text = nltk.word_tokenize(line)
            if len(text) == 0:
                k = []
            else:
                k = english_postagger.tag(text)
            index = 0
            for token in k:
                # NLTK's treebank tokenizer rewrites double quotes (") as
                # doubled forward/backward single quotes (`` and ''), so the
                # tagged token no longer matches the raw text; restore it.
                if (text[index] != token[0]) and (token[0] == '``' or token[0] == "''"):
                    k[index] = ["\"", "\'\'"]
                if token[1] not in pos:
                    pos.append(token[1])
                index += 1
            contents.append(k)
        read.save_json("data/pos/" + raw_text_dir[data_id].rsplit('\\', 1)[1],
                       contents)
    read.save_in_json("pos_tag", pos)
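
# Standalone demonstration of the quote mismatch handled above: NLTK's
# treebank tokenizer turns a double quote into `` or '', so tagger output
# must be realigned with the raw characters.
def _demo_quote_tokenization():
    tokens = nltk.word_tokenize('He said "hello"')
    return tokens  # ['He', 'said', '``', 'hello', "''"]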
# Module-level script: build per-character POS-tag sequences for documents
# start..end from the token-level tags saved by generate_pos.
start = 0
end = 63
raw_text_dir = read.read_from_json('raw_data_dir')
raw_dir_simple = read.read_from_json('raw_dir_simple')
max_len_text = read.get_char2id_dict(raw_text_dir)
char2int = read.read_from_json('char2int')
int2char = dict((i, c) for c, i in char2int.items())
text_pos_text_dict = dict()
for data_id in range(start, end):
    print(raw_dir_simple[data_id])
    pos = read.read_json("data/pos/" + raw_dir_simple[data_id])
    raw_text = read.read_from_dir(raw_text_dir[data_id])
    text_inputs = [[char2int[char] for char in raw_text]]
    postag = list()
    index = 0
    for line in raw_text.splitlines():
        if len(line) == 0:
            postag.append('\n')
            index += 1
        else:
            token_index = 0
            term = ""
            for char in line:
                if char == ' ':