def load_train_data(args, train_dir, valid_prop=0.10):
    """load training data and write to IO formatted training and validation files"""
    vocab = set()
    tfile = codecs.open(join(args.work_dir, TRAIN_FILE_NAME), 'w', 'utf-8')
    vfile = codecs.open(join(args.work_dir, VALID_FILE_NAME), 'w', 'utf-8')
    txt_files = [f for f in listdir(train_dir) if f.endswith(".txt")]
    random.shuffle(txt_files)
    num_val_files = int(len(txt_files) * valid_prop)
    for findex, txt_file in enumerate(txt_files):
        print("Reading", txt_file)
        rfile = vfile if findex < num_val_files else tfile
        doc_tokens, file_vocab = tokenize_document(join(train_dir, txt_file))
        vocab = vocab.union(file_vocab)
        annotations = read_annotations(join(train_dir, txt_file[:-3] + "ann"))
        for token in doc_tokens:
            ignore_token = False
            for ann in annotations:
                if token.start >= ann.start and token.end <= ann.end:
                    # Change this for IOB annotations
                    if ann.atype == LOC_ANN_TAG:
                        token.encoding = "I-LOC"
                    if ann.atype == PRO_ANN_TAG:
                        ignore_token = True
                    break
            if not ignore_token:
                print(token.text + "\t" + token.encoding, file=rfile)
    tfile.close()
    vfile.close()
    return vocab
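
# A minimal sketch (an assumption, not the project's actual utils module) of the
# token and annotation shapes that load_train_data relies on: tokenize_document
# is assumed to return tokens carrying character offsets plus a default "O"
# encoding, and read_annotations to return typed character spans.
from dataclasses import dataclass

@dataclass
class Token:
    text: str
    start: int
    end: int
    encoding: str = "O"  # default "outside any entity" label, overwritten to "I-LOC" above

@dataclass
class Annotation:
    atype: str  # e.g. the value behind LOC_ANN_TAG or PRO_ANN_TAG
    start: int
    end: int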
def get_input_pmc(word_emb_model, input_file):
    '''loads files for annotation'''
    window_size = 5  # By default
    n_neighbors = int(window_size / 2)
    doc_tokens, _ = tokenize_document(input_file)
    # print("processing file: {} and neighbors = {}".format(input_file, n_neighbors))
    padding = "<s>"
    words = []
    for _ in range(n_neighbors):
        words.append(padding)
    for token in doc_tokens:
        words.append(token.text)
    for _ in range(n_neighbors):
        words.append(padding)
    instances = []
    for i in range(n_neighbors, len(words) - n_neighbors):
        context = []
        for j in range(-n_neighbors, n_neighbors + 1):
            context = np.append(context, word_emb_model[words[i + j]])
        instances.append(context)
    assert len(doc_tokens) == len(instances)
    return doc_tokens, instances
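
# Hypothetical usage sketch for get_input_pmc (the embedding model and file path
# below are stand-ins, not part of the project): any mapping from word -> vector
# works as long as it also covers the "<s>" padding symbol. Each instance is the
# concatenation of the window_size (= 5) neighbouring vectors, i.e. 5 * dim
# features per token.
import numpy as np
from collections import defaultdict

emb_dim = 50
toy_model = defaultdict(lambda: np.zeros(emb_dim))  # every word maps to a zero vector

doc_tokens, instances = get_input_pmc(toy_model, "example.txt")  # illustrative path
assert all(len(inst) == 5 * emb_dim for inst in instances)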
def load_test_data(args, test_dir):
    """load test data and write to IO formatted file"""
    vocab = set()
    tfile = codecs.open(join(args.work_dir, "test-io.txt"), 'w', 'utf-8')
    txt_files = [f for f in listdir(test_dir) if f.endswith(".txt")]
    for txt_file in txt_files:
        print("Reading", txt_file)
        doc_tokens, file_vocab = tokenize_document(join(test_dir, txt_file))
        vocab = vocab.union(file_vocab)
        annotations = read_annotations(join(test_dir, txt_file[:-3] + "ann"))
        for token in doc_tokens:
            ignore_token = False
            for ann in annotations:
                if token.start >= ann.start and token.end <= ann.end:
                    # Change this for IOB annotations
                    if ann.atype == LOC_ANN_TAG:
                        token.encoding = "I-LOC"
                    if ann.atype == PRO_ANN_TAG:
                        ignore_token = True
                    break
            if not ignore_token:
                print(token.text + "\t" + token.encoding, file=tfile)
    tfile.close()
    return vocab
import nltk
import pickle

from utils import tokenize_document

resume_file = open('../assets/resume.txt', 'r')
resume = resume_file.read()
resume_file.close()

tokenizer = nltk.RegexpTokenizer(r'\w+')
resume_tokenized = tokenize_document(resume, tokenizer)
print(resume_tokenized)

pickle.dump(resume_tokenized, open('../assets/resume_tokens.p', 'wb'))
tokens_to_generate = config['predict']['tokens_to_generate']
model_path = str(u.get_model_path())

# Load the model weights ---------------------------------------------------------
model = cm.create_model()
model.load_weights(model_path)

# Setup `dict` to un-vectorize ---------------------------------------------------
int_to_token = dict((x[1], x[0]) for x in token_to_int.items())

# Seed the prediction process ----------------------------------------------------
# pin our seed for reproducibility
np.random.seed(0)

for file in data_path.iterdir():
    if file.suffix == '.txt':
        tokens = u.tokenize_document(file)
        vector = u.vectorize(tokens, token_to_int)
        indx = np.random.randint(len(tokens) - sequence_length)
        seed = [int(x) for x in vector[indx:(indx + sequence_length)]]
        break

print(f'Seed: {u.vec_to_str(seed, int_to_token)}')

# Generation ---------------------------------------------------------------------
for diversity in [1, 1.33, 1.66, 2]:
    np.random.seed(0)
    curr = seed
    generated_tokens = []

    for i in range(tokens_to_generate):
        one_hot_x = u.one_hot_single(curr, token_count)
        one_hot_y = model.predict(one_hot_x)
        y = u.un_one_hot(one_hot_y, diversity)
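
# The excerpt above stops right after sampling a token index. A plausible
# continuation (an assumption about this script, not its actual code) would
# record the sampled index and slide the seed window so the next step
# conditions on it, e.g.:
#
#         generated_tokens.append(y)
#         curr = curr[1:] + [y]
#
#     print(f'diversity {diversity}: {u.vec_to_str(generated_tokens, int_to_token)}')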
import pickle
from utils import tokenize_document, tokenizer
from collections import Counter

tokens_with_stopwords = tokenize_document(
    open('../assets/resume.txt', 'r').read(), tokenizer, remove_stopwords=False)
print('Number of Tokens: (with stopwords)', len(tokens_with_stopwords))
print('Number of Unique tokens: (with stopwords)', len(set(tokens_with_stopwords)), '\n')

tokens = pickle.load(open('../assets/resume_tokens.p', 'rb'))
print('Number of Tokens: (without stopwords)', len(tokens))
tokens_set = set(tokens)
print('Number of Unique tokens: (without stopwords)', len(tokens_set))

print('\nPercentage Reduction in tokens after removing stopwords:',
      (len(set(tokens_with_stopwords)) - len(tokens_set)) / len(tokens_set) * 100)

frequencies = Counter(tokens)
print(
    '\nThe Frequencies of the most common 10 tokens are: (in tokens without stopwords)\n',
    frequencies.most_common(10))