def refine_corpus(corpus, rule_path, output=None, thread=None):
    """
    Clean up the given corpus according to the rules defined in the rule files.
    This method utilizes multithreading to accelerate the process.

    Arguments:
        corpus(str): Path to the corpus file.
        rule_path(str): Path to the directory containing "parentheses.tsv"
            and "refine_list.tsv".
        output(str): Path to the output file.
        thread(int): Number of threads to use.
    """
    if output is None:
        output = corpus[:-4] + "_cleaned.txt"
    if not rule_path.endswith("/"):
        rule_path += "/"

    # Load rule files
    file_p = rule_path + "parentheses.tsv"
    file_r = rule_path + "refine_list.tsv"
    parentheses = load_rules(file_p)
    refine_list = load_rules(file_r)

    # Acquire the corpus
    raw_data = readlines(corpus)

    # Threading
    param = (parentheses, refine_list)
    result = generic_threading(thread, raw_data, corpus_cleanup, param)

    # Write all results to file
    write_to_file(output, result)
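# All functions in this file hand their data to the project's `generic_threading`
# helper (defined in the repo's utils, not shown here). The sketch below is NOT
# that implementation and is not called anywhere; it only illustrates the assumed
# chunk-and-pool contract so the calls above are easier to follow. Every name in
# it is hypothetical.
def _chunked_pool_sketch(n_workers, data, worker_fn, param=()):
    """Minimal sketch: split `data` into one chunk per worker, map in parallel."""
    from multiprocessing import Pool, cpu_count

    n_workers = n_workers if n_workers else cpu_count()
    size = max(1, (len(data) + n_workers - 1) // n_workers)
    chunks = [data[i:i + size] for i in range(0, len(data), size)]
    with Pool(processes=n_workers) as pool:
        # Each worker receives its chunk plus the shared parameters.
        return pool.starmap(worker_fn, [(chunk, *param) for chunk in chunks])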
def ibo_tagging(corpus, keywords, output=None, thread=None):
    """
    Tag the corpus in IBO format using the given keyword dictionary.

    Arguments:
        corpus(str): Path to the corpus file.
        keywords(str): Path to the keyword dictionary (JSON file).
        output(str): Path to the output file.
        thread(int): Number of threads to use.
    """
    # Output name
    if output is None:
        output = corpus[:-4] + "_ibo.tsv"

    # Load and merge dictionaries (previous approach, kept for reference)
    # files = [itr for itr in os.listdir(rule) if itr.endswith("_leaf.json")]
    # entity = dict()
    # for itr in files:
    #     entity.update(json.load(open(rule + itr, "r")))
    # Load entities
    entity = json.load(open(keywords, "r"))

    # Read corpus
    raw_data = readlines(corpus)

    # Threading
    param = (entity,)
    result = generic_threading(thread, raw_data, generate_ibo, param)

    # Write result to file
    file_io(output, result)
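# `generate_ibo` (the worker used above) is defined elsewhere in the repo. The toy
# function below is only a sketch of the B/I/O scheme on whitespace-split tokens,
# assuming the keyword dictionary maps surface forms to entity types; it is not
# the project's tagger.
def _ibo_toy_example(sentence, keyword_types):
    tokens = sentence.split()
    tags = []
    i = 0
    while i < len(tokens):
        matched = False
        # Greedily try the longest keyword phrase starting at position i.
        for j in range(len(tokens), i, -1):
            phrase = " ".join(tokens[i:j])
            if phrase in keyword_types:
                tags.append((tokens[i], "B-" + keyword_types[phrase]))
                tags.extend((tok, "I-" + keyword_types[phrase]) for tok in tokens[i + 1:j])
                i = j
                matched = True
                break
        if not matched:
            tags.append((tokens[i], "O"))
            i += 1
    return tags
# _ibo_toy_example("aspirin reduces fever", {"aspirin": "Drug"})
# -> [("aspirin", "B-Drug"), ("reduces", "O"), ("fever", "O")]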
def preliminary_cleanup(corpus, rule, output=None, thread=None, limit=None):
    """
    Perform a preliminary cleanup of the corpus to make it easier for the
    subsequent processing steps. This method can be used to correct missing
    spaces after punctuation; any other customized rules can be added to the
    rule file. See punctuation_cleanup in utils for the rule format.

    Arguments:
        corpus(str): Path to the corpus file.
        rule(str): Path to the processing rule file.
        output(str): Path to the output file.
        thread(int): Number of threads to use.
        limit(int): Maximum number of lines to read from the corpus.
    """
    # Output name
    if output is None:
        output = corpus[:-4] + "_preprocessed.tsv"

    # Load rules
    rules = load_rules(rule)

    # Load data (skip the first line)
    raw_data = readlines(corpus, limit=limit, skip=True)

    # Threading
    param = (rules, "PRELIMINARY")
    result = generic_threading(thread, raw_data, punctuation_cleanup, param)

    # Write result to file
    write_to_file(output, result)
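# The actual rule format is defined by `load_rules` / `punctuation_cleanup` in the
# repo's utils; the snippet below only illustrates the kind of fix the docstring
# describes (restoring a missing space after sentence punctuation). It is a hedged
# sketch, not the project's rule engine, and would need extra guards for
# abbreviations in real text.
def _add_missing_space_sketch(text):
    import re
    # "foo.Bar" -> "foo. Bar": insert a space after . ! ? when a letter follows.
    return re.sub(r"([.!?])([A-Za-z])", r"\1 \2", text)
# _add_missing_space_sketch("Cells were lysed.Samples were frozen.")
# -> "Cells were lysed. Samples were frozen."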
def recognize_sentences(corpus, keywords_path, mode, trim=True, label=False,
                        output=None, thread=None, limit=None):
    """
    Split the corpus into sentences and recognize keyword mentions in each
    sentence.

    Arguments:
        corpus(str): Path to the corpus file.
        keywords_path(str): Path to where the keyword dictionaries are.
        mode(str): Mention-matching mode passed to keyword_in_sentences.
        trim(bool): Whether to trim the merged keyword dictionary.
        output(str): Path to the output file.
        thread(int): Number of threads to use.
        limit(int): Maximum number of lines to read from the corpus.
    """
    # Output name
    if output is None:
        output = corpus[:-4] + "_sentence.tsv"

    # Decompose the corpus into sentences, each as one datum
    # Load corpus (skip first line for the smaller PubMed version)
    raw_data = readlines(corpus, begin=1, limit=limit)

    # Load tokenizer
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # Threading (guard against thread=None before halving)
    param = (tokenizer, )
    n_split = thread // 2 if thread else thread
    context = generic_threading(n_split, raw_data, parallel_split, param)
    context = list(chain.from_iterable(context))
    del raw_data

    print()
    print("Recognize mentions in sentences (mode: {:s})".format(mode))

    # Load all mentions
    entity = merge_dict(keywords_path, trim=trim)

    # Threading
    keywords = list(entity.keys())
    param = (keywords, mode)
    result = generic_threading(thread, context, keyword_in_sentences, param)

    # Write all results to file
    write_to_file(output, result)
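# `keyword_in_sentences` is the repo's worker and also depends on `mode`, which is
# not reproduced here. The sketch below only shows the basic idea of keeping
# sentences that contain at least one keyword mention; its output format is an
# assumption, not the project's.
def _sentences_with_keywords_sketch(sentences, keywords):
    hits = []
    for sent in sentences:
        matched = [kw for kw in keywords if kw in sent]
        if matched:
            hits.append((sent, matched))
    return hits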
def extract_pos(args):
    """
    Run part-of-speech tagging on the sentence file and write the tagged
    tokens (one "<tag> <token>" pair per line) to the output file.
    """
    with open(args.sentences, 'r') as f, open(args.found, 'w') as fw:
        # Acquire all sentences (skip first line)
        raw_data = f.read().splitlines()[1:]

        # Threading
        result = generic_threading(args.thread, raw_data, thread_pos_tagging)

        # Write one "<tag> <token>" pair per line, with a marker between sentences
        for line in tqdm(list(chain.from_iterable(result))):
            fw.write('\n'.join('{} {}'.format(x[1], x[0]) for x in line))
            fw.write('\n#########split_sentence#########\n')

    print("File saved in {:s}".format(args.found))
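# `thread_pos_tagging` is defined elsewhere in the repo. A minimal sketch of what
# one worker is assumed to do with its chunk of sentences, using NLTK's word
# tokenizer and POS tagger (requires the "punkt" and "averaged_perceptron_tagger"
# NLTK data packages); the real worker may differ.
def _pos_tag_chunk_sketch(sentences):
    import nltk
    # One list of (token, tag) pairs per sentence, matching the writer above,
    # which emits "<tag> <token>" for each pair.
    return [nltk.pos_tag(nltk.word_tokenize(sent)) for sent in sentences]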
def parse_sentences(corpus, output=None, thread=None, limit=None):
    """
    Split the corpus into sentences using the NLTK Punkt tokenizer.

    Arguments:
        corpus(str): Path to the corpus file.
        output(str): Path to the output file.
        thread(int): Number of threads to use.
        limit(int): Maximum number of lines to read from the corpus.
    """
    # Output name
    if output is None:
        output = corpus[:-4] + "_sentence.txt"

    # Load corpus (skip first line)
    raw_data = readlines(corpus, begin=1, limit=limit)

    # Load tokenizer
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # Threading
    param = (tokenizer, )
    result = generic_threading(thread, raw_data, parallel_parse, param)

    # Write all results to file
    write_to_file(output, result)
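# `parallel_parse` is the repo's worker; the snippet below only demonstrates the
# Punkt tokenizer call it is assumed to build on (requires the "punkt" NLTK data
# package to be downloaded).
def _punkt_split_sketch(text):
    import nltk
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    return tokenizer.tokenize(text)
# _punkt_split_sketch("Mutations were found. They were rare.")
# -> ["Mutations were found.", "They were rare."]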
def run(model_dir, input, test_size, n_thread=20, tag=None, text_only=False,
        neg_sample=False, n_sample=5):
    """
    Split the dataset into training/validation/testing partitions such that
    all instances of a given mention end up in the same partition, optionally
    prepare negative-sampling metadata, and save the index files to model_dir.
    """
    postfix = ("_" + tag) if tag is not None else ""

    # Parse directory name
    if not model_dir.endswith("/"):
        model_dir += "/"
    # Create directory to store the model
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    print("Loading dataset..")
    dataset = pd.read_csv(
        input,
        sep="\t",
        names=["label", "context", "mention", "begin", "end", "description"],
        dtype=str,
        quoting=csv.QUOTE_NONE,
        nrows=None)

    dataset["mention"] = dataset["mention"].astype(str)
    mentions = dataset["mention"].values
    labels = dataset["label"].values

    # Used when splitting the data in a mention-specific way
    # np.random.shuffle(mentions)
    for idx, itr in enumerate(mentions):
        if type(itr) == float or type(itr) == int:
            print(idx, itr, dataset["label"][idx], dataset["context"][idx])

    print("{0} unique mentions...".format(len(set(mentions))))
    unique, counts = np.unique(mentions, return_counts=True)
    mention_count = dict(zip(unique, counts))

    print("Processing mention_index...")
    param = (mentions, )
    key_list = list(mention_count.keys())
    # [["mention1", [indices]], ["mention2", [indices]], ...]
    mention_index = generic_threading(n_thread, key_list, parallel_index, param)

    mention = []
    indices = []
    order = []
    for mention_pair_thread in mention_index:
        order.append(mention_pair_thread[0])
    order_idx = sorted(range(len(order)), key=lambda k: order[k])
    ########################################
    # TO-BE-VERIFIED CODE NECESSITY
    for thread_idx in order_idx:
        for mention_pair in mention_index[thread_idx][1:]:
            mention.append(mention_pair[0])
            indices.append(mention_pair[1])
    ########################################
    mention_index = dict(zip(mention, indices))

    total_length = mentions.shape[0]
    test_len = total_length * test_size
    train_len = total_length - 2 * test_len

    train_index = list()
    test_index = list()
    validation_index = list()
    count = 0

    print("Processing training_index...")
    ########################################
    # TO-BE REVISED TO A MORE ELEGANT SPLITTING WAY
    np.random.shuffle(unique)
    for mention in tqdm(unique):
        if count < train_len:
            count += mention_count[mention]
            train_index.append(mention_index[mention])
        elif count >= train_len and count < (train_len + test_len):
            count += mention_count[mention]
            validation_index.append(mention_index[mention])
        else:
            count += mention_count[mention]
            test_index.append(mention_index[mention])
    ########################################

    # Flatten lists
    print("Flatten train/validation/test index...")
    train_index = list(itertools.chain.from_iterable(train_index))
    validation_index = list(itertools.chain.from_iterable(validation_index))
    test_index = list(itertools.chain.from_iterable(test_index))

    print("Number of instances in all sets:")
    print(" - Training   :", len(train_index))
    print(" - Testing    :", len(test_index))
    print(" - Validation :", len(validation_index))

    train_index = np.array(train_index)
    validation_index = np.array(validation_index)
    test_index = np.array(test_index)

    # Shuffle the indices
    np.random.shuffle(train_index)
    np.random.shuffle(validation_index)
    np.random.shuffle(test_index)

    # Negative samples (KBP version only)
    # Get true (positive) labels for each instance
    if neg_sample:
        pos_label = labels[train_index]
        data_size = len(pos_label)
        # Calculate the density of each label
        distribution = Counter(pos_label)
        label_idx = []
        print("Producing distribution & label_dict...")
        for key in tqdm(distribution):
            # Normalize probabilities
            distribution[key] = distribution[key] / data_size
            # Global index (To-Be-Implemented)
            # label_idx.append(np.where(labels == key)[0][0])
            # Local index
            # TO-DO: sample all possible candidates (second index)
            label_idx.append(np.where(pos_label == key)[0][0])

        train_label = list(distribution.keys())
        train_label_prob = list(distribution.values())
        label_dict = dict(zip(list(distribution.keys()), label_idx))
        param = (train_label, train_label_prob, label_dict, n_sample, )
        print("Negative sampling: {}/instance".format(n_sample))
        # neg_samples = generic_threading(n_thread, pos_label, negative_sampling, param)
        # neg_samples = np.array(list(itertools.chain.from_iterable(neg_samples)))
        # NOTE: the threaded negative_sampling call above is currently disabled,
        # so fall back to None to avoid a NameError when saving below.
        neg_samples = None
    else:
        neg_samples = None

    filename = "{:s}pos_neg_index{:s}.pkl".format(model_dir, postfix)
    pos_neg_sample = {"positive": train_index, "negative": neg_samples}
    pkl.dump(pos_neg_sample, open(filename, "wb"))

    filename = "{:s}training_index{:s}.pkl".format(model_dir, postfix)
    pkl.dump(train_index, open(filename, "wb"))
    filename = "{:s}validation_index{:s}.pkl".format(model_dir, postfix)
    pkl.dump(validation_index, open(filename, "wb"))
    filename = "{:s}testing_index{:s}.pkl".format(model_dir, postfix)
    pkl.dump(test_index, open(filename, "wb"))

    print("Writing new_test_mention_list{:s}..".format(postfix))
    m_train = mentions[train_index]
    m_test = mentions[test_index]
    m_val = mentions[validation_index]
    print("Unique mentions in each partition:")
    print(" - Training   : {}".format(len(set(m_train))))
    print(" - Testing    : {}".format(len(set(m_test))))
    print(" - Validation : {}".format(len(set(m_val))))

    filename = model_dir + "test_mention_list{:s}.txt".format(postfix)
    with open(filename, "w") as f:
        for mention in m_test:
            f.write(mention + "\n")
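# The split in run() is "mention specific": every instance of a surface mention is
# assigned to exactly one of train/validation/test, so no mention string is shared
# across partitions. The self-contained sketch below reproduces that idea on a toy
# array; all names are hypothetical and it is not used by run().
def _mention_split_sketch(mentions, test_size, seed=0):
    import numpy as np
    mentions = np.asarray(mentions)
    rng = np.random.RandomState(seed)
    unique = np.unique(mentions)
    rng.shuffle(unique)
    n_test = int(len(mentions) * test_size)
    train_idx, val_idx, test_idx = [], [], []
    seen = 0
    for m in unique:
        idx = np.where(mentions == m)[0]
        if seen < len(mentions) - 2 * n_test:
            train_idx.extend(idx)      # fill training first
        elif seen < len(mentions) - n_test:
            val_idx.extend(idx)        # then validation
        else:
            test_idx.extend(idx)       # remainder goes to testing
        seen += len(idx)
    return np.array(train_idx), np.array(val_idx), np.array(test_idx)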