Example #1
def refine_corpus(corpus, rule_path, output=None, thread=None):
    """
    Clean up the given corpus according to the rules defined in the files.
    This method utilizes multithreading to accelerate the process.

    Arguments:
        corpus(str): Path to the corpus file.
        rule_path(str): Path to where "parentheses.tsv" and 
            "refine_list.tsv" are.
        thread(int): Number of thread to process.
        output(str): Path to the output file.
    """
    if output is None:
        output = corpus[:-4] + "_cleaned.txt"
    if not rule_path.endswith("/"):
        rule_path += "/"

    # Load rule files
    file_p = rule_path + "parentheses.tsv"
    file_r = rule_path + "refine_list.tsv"
    parentheses = load_rules(file_p)
    refine_list = load_rules(file_r)

    # Acquire the corpus
    raw_data = readlines(corpus)

    # Threading
    param = (parentheses, refine_list)
    result = generic_threading(thread, raw_data, corpus_cleanup, param)

    # Write all result to file
    write_to_file(output, result)
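
A minimal usage sketch for refine_corpus; the paths below are hypothetical placeholders, and the helpers it calls (load_rules, readlines, generic_threading, corpus_cleanup, write_to_file) are assumed to be importable from the project's utilities.

# Usage sketch (hypothetical paths):
# refine_corpus(
#     corpus="data/pubmed_corpus.txt",   # raw corpus to clean
#     rule_path="rules/",                # directory holding parentheses.tsv and refine_list.tsv
#     output="data/pubmed_cleaned.txt",  # defaults to <corpus>_cleaned.txt when omitted
#     thread=8,                          # number of worker threads
# )
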
Example #2
def ibo_tagging(corpus, keywords, output=None, thread=None):
    """
    Arguments:
        corpus(str): Path to the corpus file.
        keywords(str): Path to where keywords dictionaries is.
        thread(int): Number of thread to process.
        output(str): Path to the output file.
    """
    # output name
    if output is None:
        output = corpus[:-4] + "_ibo.tsv"

    # Load and merge dictionary
    # files = [itr for itr in os.listdir(rule) if itr.endswith("_leaf.json")]

    # Load entities
    # entity = dict()
    # for itr in files:
    #     entity.update(json.load(open(rule + itr, "r")))
    with open(keywords, "r") as f:
        entity = json.load(f)

    # Read corpus
    raw_data = readlines(corpus)

    # Threading
    param = (entity,)
    result = generic_threading(thread, raw_data, generate_ibo, param)

    # Write result to file
    file_io(output, result)
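
A minimal usage sketch for ibo_tagging; the file names are hypothetical, and generate_ibo, generic_threading, and file_io are assumed to come from the project's utilities.

# Usage sketch (hypothetical paths):
# ibo_tagging(
#     corpus="data/pubmed_cleaned.txt",  # cleaned corpus produced by refine_corpus
#     keywords="data/keywords.json",     # JSON dictionary of entity keywords
#     thread=8,
# )
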
Example #3
def preliminary_cleanup(corpus, rule, output=None, thread=None, limit=None):
    """
    Preliminary cleanup the corpus to make it easier for further
    processing methods. This method can be used to correct the
    missing spaces after punctuations any other customized rules
    can be added to the rule file, see punctuation_cleanup in utils
    for the formatting of the rules.

    Arguments:
        corpus(str): Path to the corpus file.
        rule(str): Path to the processing rule file.
        thread(int): Number of thread to process.
        output(str): Path to the output file.
    """
    # output name
    if output is None:
        output = corpus[:-4] + "_preprocessed.tsv"

    # Load rules
    rules = load_rules(rule)
    # Load data
    raw_data = readlines(corpus, limit=limit, skip=True)

    # Threading
    param = (rules, "PRELIMINARY")
    result = generic_threading(thread, raw_data, punctuation_cleanup, param)

    # Write result to file
    write_to_file(output, result)
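
A minimal usage sketch for preliminary_cleanup; the paths are hypothetical, and punctuation_cleanup is assumed to accept the "PRELIMINARY" rule set passed in the function body.

# Usage sketch (hypothetical paths):
# preliminary_cleanup(
#     corpus="data/pubmed_raw.txt",   # raw corpus
#     rule="rules/preliminary.tsv",   # punctuation/spacing rule file
#     thread=8,
#     limit=None,                     # read the whole file
# )
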
Example #4
def recognize_sentences(corpus,
                        keywords_path,
                        mode,
                        trim=True,
                        label=False,
                        output=None,
                        thread=None,
                        limit=None):
    """
    Arguments:
        corpus(str): Path to the corpus file.
        keywords_path(str): Path to where keywords dictionaries are.
        thread(int): Number of thread to process.
        output(str): Path to the output file.
    """
    # output name
    if output is None:
        output = corpus[:-4] + "_sentence.tsv"

    # Decompose the corpus into sentences, one sentence per datum
    # Load corpus (skip first line for the smaller PubMed version)
    raw_data = readlines(corpus, begin=1, limit=limit)
    # Load tokenizer
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    # Threading (use half of the threads for sentence splitting)
    param = (tokenizer, )
    n_split_thread = thread // 2 if thread is not None else None
    context = generic_threading(n_split_thread, raw_data, parallel_split, param)
    context = list(chain.from_iterable(context))
    del raw_data

    print()
    print("Recognize mentions in sentences (mode: {:s})".format(mode))

    # Load all mentions
    entity = merge_dict(keywords_path, trim=trim)

    # Threading
    keywords = list(entity.keys())
    param = (keywords, mode)
    result = generic_threading(thread, context, keyword_in_sentences, param)

    # Write all results to file
    write_to_file(output, result)
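
A minimal usage sketch for recognize_sentences; the paths and mode value are hypothetical placeholders (the valid modes are defined by keyword_in_sentences, which is not shown here).

# Usage sketch (hypothetical paths and mode):
# recognize_sentences(
#     corpus="data/pubmed_cleaned.txt",
#     keywords_path="data/keywords/",  # directory of keyword dictionaries merged by merge_dict
#     mode="SINGLE",                   # hypothetical mode string
#     trim=True,
#     thread=8,
#     limit=None,
# )
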
Example #5
def extract_pos(args):
    """
    POS-tag the sentences in args.sentences using multithreading and
    write the tagged tokens to args.found, one token per line, with
    sentences separated by a split marker.
    """
    with open(args.sentences, 'r') as f, open(args.found, 'w') as fw:
        # Acquire all sentences
        raw_data = f.read().splitlines()[1:] # skip first line
        # Threading
        result = generic_threading(args.thread, raw_data, thread_pos_tagging)

        for line in tqdm(list(chain.from_iterable(result))):
            fw.write('\n'.join('{} {}'.format(x[1], x[0]) for x in line))
            fw.write('\n#########split_sentence#########\n')
        print("File saved in {:s}".format(args.found))
def parse_sentences(corpus, output=None, thread=None, limit=None):
    """
    Split the corpus into sentences using the NLTK punkt tokenizer
    with multithreading and write the result to the output file.

    Arguments:
        corpus(str): Path to the corpus file.
        output(str): Path to the output file.
        thread(int): Number of threads to use.
        limit(int): Maximum number of lines to read from the corpus.
    """
    if output is None:
        output = corpus[:-4] + "_sentence.txt"

    # Load corpus
    raw_data = readlines(corpus, begin=1, limit=limit)  # skip first line

    # Load tokenizer
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # Threading
    param = (tokenizer, )
    result = generic_threading(thread, raw_data, parallel_parse, param)

    # Write all result to file
    write_to_file(output, result)
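
A minimal usage sketch for parse_sentences; the path is hypothetical, and the punkt tokenizer must already be available locally (e.g. via nltk.download("punkt")).

# Usage sketch (hypothetical path):
# parse_sentences(
#     corpus="data/pubmed_cleaned.txt",
#     thread=8,
#     limit=None,
# )
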
def run(model_dir,
        input,
        test_size,
        n_thread=20,
        tag=None,
        text_only=False,
        neg_sample=False,
        n_sample=5):
    """
    Split the dataset into training / validation / testing index sets
    on a per-mention basis (all instances of a mention go to the same
    split), optionally prepare negative-sampling statistics, and dump
    the resulting index arrays to model_dir as pickles.
    """
    postfix = ("_" + tag) if tag is not None else ""
    # Parse directory name
    if not model_dir.endswith("/"):
        model_dir += "/"
    # Create directory to store model
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    print("Loading dataset..")
    dataset = pd.read_csv(
        input,
        sep="\t",
        names=["label", "context", "mention", "begin", "end", "description"],
        dtype=str,
        quoting=csv.QUOTE_NONE,
        nrows=None)
    dataset["mention"] = dataset["mention"].astype(str)
    mentions = dataset["mention"].values
    labels = dataset["label"].values

    # Used for splitting the data on a per-mention basis
    # np.random.shuffle(mentions)
    for idx, itr in enumerate(mentions):
        if type(itr) == float or type(itr) == int:
            print(idx, itr, dataset["label"][idx], dataset["context"][idx])
    print("{0} unique mentions...".format(len(set(mentions))))
    unique, counts = np.unique(mentions, return_counts=True)
    mention_count = dict(zip(unique, counts))

    print("Processing mention_index...")
    param = (mentions, )
    key_list = list(mention_count.keys())
    # [["mention1",[indices]],["mention2",[indices]],...]
    mention_index = generic_threading(n_thread, key_list, parallel_index,
                                      param)
    mention = []
    indices = []
    order = []

    for mention_pair_thread in mention_index:
        order.append(mention_pair_thread[0])

    order_idx = sorted(range(len(order)), key=lambda k: order[k])

    ########################################
    # TO-BE-VERIFIED CODE NECESSITY
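    # The first element of each thread's result is its ordering key; the
    # remaining elements are [mention, indices] pairs. Re-assemble the pairs
    # in the original thread order before building the mention -> indices map.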
    for thread_idx in order_idx:
        for mention_pair in mention_index[thread_idx][1:]:
            mention.append(mention_pair[0])
            indices.append(mention_pair[1])
    ########################################

    mention_index = dict(zip(mention, indices))

    total_length = mentions.shape[0]
    test_len = total_length * test_size
    train_len = total_length - 2 * test_len

    train_index = list()
    test_index = list()
    validation_index = list()

    count = 0
    print("Processing training_index...")
    ########################################
    # TO-BE REVISED TO A MORE ELEGANT SPLITTING WAY
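    # Mentions are shuffled and then assigned to a partition as whole groups,
    # so every instance of a given mention ends up in the same
    # train / validation / test split.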
    np.random.shuffle(unique)
    for mention in tqdm(unique):
        if count < train_len:
            count += mention_count[mention]
            train_index.append(mention_index[mention])
        elif count >= train_len and count < (train_len + test_len):
            count += mention_count[mention]
            validation_index.append(mention_index[mention])
        else:
            count += mention_count[mention]
            test_index.append(mention_index[mention])
    ########################################

    # Flatten list
    print("Flatten train/validation/test index...")
    train_index = list(itertools.chain.from_iterable(train_index))
    validation_index = list(itertools.chain.from_iterable(validation_index))
    test_index = list(itertools.chain.from_iterable(test_index))

    print("Number of instances in all sets:")
    print(" - Training   :", len(train_index))
    print(" - Testing    :", len(test_index))
    print(" - Validation :", len(validation_index))

    train_index = np.array(train_index)
    validation_index = np.array(validation_index)
    test_index = np.array(test_index)

    # shuffle the index
    np.random.shuffle(train_index)
    np.random.shuffle(validation_index)
    np.random.shuffle(test_index)

    # negative samples, kbp version only
    # Get true (positive) labels for each instance
    if neg_sample:
        pos_label = labels[train_index]
        data_size = len(pos_label)
        # Calculate the density of each labels
        distribution = Counter(pos_label)
        label_idx = []
        print("Producing distribution & label_dict...")
        for key in tqdm(distribution):
            # Normalize probabilities
            distribution[key] = distribution[key] / data_size
            # Global index (To-Be-Implemented)
            # label_idx.append(np.where(labels == key)[0][0])
            # Local index
            # TO-DO: sample all possible candidates (second index)
            label_idx.append(np.where(pos_label == key)[0][0])

        train_label = list(distribution.keys())
        train_label_prob = list(distribution.values())
        label_dict = dict(zip(list(distribution.keys()), label_idx))

        param = (
            train_label,
            train_label_prob,
            label_dict,
            n_sample,
        )
        print("Negative sampling: {}/instance".format(n_sample))
        # neg_samples = generic_threading(n_thread, pos_label, negative_sampling, param)
        # neg_samples = np.array(list(itertools.chain.from_iterable(neg_samples)))
        # The sampling call above is currently disabled; fall back to None
        # so that dumping pos_neg_sample below does not raise a NameError.
        neg_samples = None
    else:
        neg_samples = None

    filename = "{:s}pos_neg_index{:s}.pkl".format(model_dir, postfix)
    pos_neg_sample = {"positive": train_index, "negative": neg_samples}
    pkl.dump(pos_neg_sample, open(filename, "wb"))

    filename = "{:s}training_index{:s}.pkl".format(model_dir, postfix)
    pkl.dump(train_index, open(filename, "wb"))
    filename = "{:s}validation_index{:s}.pkl".format(model_dir, postfix)
    pkl.dump(validation_index, open(filename, "wb"))
    filename = "{:s}testing_index{:s}.pkl".format(model_dir, postfix)
    pkl.dump(test_index, open(filename, "wb"))

    print("Writing new_test_mention_list{:s}..".format(postfix))
    m_train = mentions[train_index]
    m_test = mentions[test_index]
    m_val = mentions[validation_index]

    print("Unique mentions in each partition: ")
    print(" - Training   : {}".format(len(set(m_train))))
    print(" - Testing    : {}".format(len(set(m_test))))
    print(" - Validation : {}".format(len(set(m_val))))

    filename = model_dir + "test_mention_list{:s}.txt".format(postfix)
    with open(filename, "w") as f:
        for mention in m_test:
            f.write(mention + "\n")
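
A minimal usage sketch for run; the paths and tag are hypothetical placeholders, and the input TSV is assumed to have the six columns named in the pd.read_csv call above.

# Usage sketch (hypothetical paths):
# run(
#     model_dir="model/",                 # pickled index files are written here
#     input="data/mentions.tsv",          # TSV with label/context/mention/begin/end/description
#     test_size=0.1,                      # fraction of instances per held-out split
#     n_thread=20,
#     tag="v1",                           # appended to output filenames as "_v1"
#     neg_sample=False,
# )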