Example #1
def main(data_folder_path):
    scispacy_parser = scispacy_util.SciSpaCyParser()
    train_path = os.path.join(data_folder_path, "train")
    dev_path = os.path.join(data_folder_path, "dev")
    test_path = os.path.join(data_folder_path, "test")

    # parse train set
    conll_parser = ConllParser(train_path, scispacy_parser)
    conll_parser.parse_text_files_to_conll_format()

    # parse dev set
    conll_parser = ConllParser(dev_path, scispacy_parser)
    conll_parser.parse_text_files_to_conll_format()

    # parse test set
    conll_parser = ConllParser(test_path, scispacy_parser)
    conll_parser.parse_text_files_to_conll_format()
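
For context, a minimal driver for main is sketched below. The argparse wiring is an assumption (the example does not show how the script is invoked); the expected layout of train/, dev/, and test/ subfolders follows the paths built above.

import argparse

if __name__ == "__main__":
    # Hypothetical command-line entry point for main() above; the real
    # script's argument handling is not shown in this example.
    arg_parser = argparse.ArgumentParser(
        description="Convert train/dev/test publication text files to CoNLL format")
    arg_parser.add_argument("--data_folder_path", required=True,
                            help="folder containing train/, dev/, and test/ subfolders")
    args = arg_parser.parse_args()
    main(args.data_folder_path)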
Example #2
    def __init__(self, train_path, dev_path, kb_path, test_path=None):
        # path to the data folder for the train set
        self.train_path = train_path

        # path to the data folder for the dev set
        self.dev_path = dev_path

        # path to the json kb file
        self.kb_path = kb_path

        # optional path to the data folder for the test set
        # if this argument is passed in, the model will use mentions from the dev
        # and train set to make predictions on the test set.
        # Otherwise it will use mentions from the train set to make predictions on the
        # test set
        self.test_path = test_path

        # set of unique mentions in the dev set
        self._dev_set_mentions = set()
        self._build_dev_set_mentions()

        # set of unique mentions in the train set
        self._train_set_mentions = set()
        self._build_train_set_mentions()

        # set of unique mentions in the entire kb
        self._all_mentions = set()
        self._build_all_mentions()

        # dictionary mapping dataset id to a set of mentions of that dataset
        self._id_to_mentions = {}
        self._build_id_to_mentions()

        # set of english stopwords
        self._stopwords = set(stopwords.words('english'))

        # an instance of a scispacy parser
        self._scispacy_parser = scispacy_util.SciSpaCyParser()

        # dictionary mapping mention to the number of datasets it is a mention for
        self._mention_dataset_count = {}

        # the total number of datasets
        self._dataset_count = 0
        self._build_mention_dataset_count()

        # precompile mention regexes
        self._dataset_id_to_regexes = {}
        for dataset_id in self._id_to_mentions:
            compiled_res = []
            for mention in self._id_to_mentions[dataset_id]:
                mention_patterns = self._build_mention_patterns(mention)
                for pattern in mention_patterns:
                    compiled_re = re.compile(pattern)
                    compiled_res.append(compiled_re)
            self._dataset_id_to_regexes[dataset_id] = compiled_res
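
As a usage illustration, the precompiled patterns in _dataset_id_to_regexes could be applied to a publication's text roughly as follows. find_candidate_datasets is a hypothetical helper, not part of the original class.

    def find_candidate_datasets(self, publication_text):
        """Hypothetical helper: return the ids of datasets whose precompiled
           mention patterns match somewhere in the publication text."""
        matched_ids = set()
        for dataset_id, compiled_res in self._dataset_id_to_regexes.items():
            for compiled_re in compiled_res:
                if compiled_re.search(publication_text):
                    matched_ids.add(dataset_id)
                    # one matching pattern is enough to flag this dataset
                    break
        return matched_ids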
Example #3
def main(data_folder_path):
    scispacy_parser = scispacy_util.SciSpaCyParser()
    train_path = os.path.join(data_folder_path, "train")
    dev_path = os.path.join(data_folder_path, "dev")
    test_path = os.path.join(data_folder_path, "test")

    # parse train set
    conll_parser = ConllParser(train_path, scispacy_parser)
    conll_parser.parse_text_files_to_conll_format()

    # parse dev set
    conll_parser = ConllParser(dev_path, scispacy_parser)
    conll_parser.parse_text_files_to_conll_format()

    # parse test set
    conll_parser = ConllParser(test_path, scispacy_parser)
    conll_parser.parse_text_files_to_conll_format()
Example #4
    def __init__(self, train_path, dev_path, sage_methods_path,
                 leipzig_word_counts_path):
        # path to the data folder for the train set
        self.train_path = train_path

        # path to the data folder for the dev set
        self.dev_path = dev_path

        # read the list of sage methods and prepare a regex to match them.
        with open(sage_methods_path, mode='rt') as sage_methods_file:
            sage_method_entries = json.load(sage_methods_file)["@graph"]
        method_names = []
        for entry in sage_method_entries:
            if "skos:prefLabel" in entry:
                method_names.append(entry["skos:prefLabel"]["@value"])
            if "skos:altLabel" in entry:
                if isinstance(entry["skos:altLabel"], list):
                    for label in entry["skos:altLabel"]:
                        method_names.append(label["@value"])
                else:
                    method_names.append(entry["skos:altLabel"]["@value"])
        # lowercase and remove duplicates.
        method_names = list({name.lower() for name in method_names})
        # escape regex metacharacters in the method names.
        method_regexes = [
            re.escape(method_name) for method_name in method_names
        ]
        methods_regex_string = r'\b(?P<method_name>' + '|'.join(
            method_regexes) + r')\b'
        # to debug the regex: print(methods_regex_string)
        self.sage_methods_regex = re.compile(methods_regex_string,
                                             re.IGNORECASE)

        # set of english stopwords
        self._stopwords = set(stopwords.words('english'))

        # an instance of a scispacy parser
        self._scispacy_parser = scispacy_util.SciSpaCyParser()

        # read word counts in the Leipzig corpus.
        self._read_leipzig_word_counts_file(leipzig_word_counts_path)
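
Because sage_methods_regex above defines a method_name named group, a hypothetical helper (not in the original class) could pull the distinct method names out of a passage like this:

    def find_sage_methods(self, text):
        """Hypothetical helper: return the distinct SAGE method names
           mentioned in the given text, lowercased."""
        return {match.group("method_name").lower()
                for match in self.sage_methods_regex.finditer(text)}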
Example #5
def generate_citations_from_ner_mentions(
        ner_mentions: List[Dict[str, Union[int, str, float]]], kb_path: str):
    """Generate candidate citations for the mentions produced by the ner model by using TFIDF
       weighted overlap with dataset titles

       @param ner_mentions: list of the ner_mentions
       @param kb_path: path to the knowledge base of datasets
    """
    nltk_stopwords = set(stopwords.words('english'))
    scispacy_parser = scispacy_util.SciSpaCyParser()
    substring_matches = set()
    tfidf_vectorizer = text_utils.get_tfidf_vectorizer()

    with open(kb_path) as kb_file_:
        kb = json.load(kb_file_)

    dataset_titles = []
    tokenized_dataset_titles = []
    dataset_ids = []
    dataset_id_to_title = {}
    for dataset in tqdm(kb, desc="processing kb"):
        dataset_title = text_utils.text_preprocess(dataset["title"])
        dataset_id = dataset["data_set_id"]
        dataset_titles.append(dataset_title)
        tokenized_dataset_titles.append(dataset_title.split(" "))
        dataset_ids.append(dataset_id)
        dataset_id_to_title[dataset_id] = dataset_title.split(" ")

    output_citations = []
    num_candidates = []
    i = 0
    mention_citations = []
    for mention in tqdm(ner_mentions,
                        desc="Generating candidates from ner mentions"):
        publication_id = mention["publication_id"]
        mention_text = mention["mention"]
        instance = mention["instance"]

        if len(instance) - len(mention_text.split()) < 5:
            continue

        if len(mention_text.split()) == 1 and not mention_text.isupper():
            continue

        parsed_sentence = scispacy_parser.scispacy_create_doc(
            ' '.join(instance))
        pos_counts = defaultdict(int)
        for t in parsed_sentence:
            pos_counts[t.pos_] += 1

        if pos_counts["NOUN"] + pos_counts["VERB"] == 0:
            continue

        if (pos_counts["NUM"] + pos_counts["SYM"] + pos_counts["PUNCT"]
            ) > 0.4 * len(parsed_sentence) and pos_counts["VERB"] == 0:
            continue

        mention_citations.append({
            "publication_id": publication_id,
            "mention": mention_text,
            "score": mention["score"]
        })

        mention_text = text_utils.text_preprocess(mention_text)
        dataset_candidates = text_utils.get_substring_candidates(
            dataset_ids, dataset_titles, tokenized_dataset_titles,
            mention_text, instance, nltk_stopwords, scispacy_parser,
            tfidf_vectorizer)
        num_candidates.append(0)

        sorted_candidates = []
        for dataset_id, match_count in zip(dataset_candidates[0],
                                           dataset_candidates[1]):
            sorted_candidates.append((dataset_id, match_count))

        sorted_candidates = sorted(sorted_candidates,
                                   key=lambda x: x[1],
                                   reverse=True)

        filtered_candidates = []
        for candidate in sorted_candidates:
            score = candidate[1]
            if score > 0.0:
                filtered_candidates.append((candidate[0], score))

        # keep at most the top 30 positive-scoring candidates for this mention
        for dataset_id, score in filtered_candidates[:30]:
            num_candidates[i] += 1
            output_dict = {}
            output_dict["publication_id"] = publication_id
            output_dict["data_set_id"] = dataset_id
            output_dict["score"] = score
            output_dict["mention_list"] = [mention["mention"]]
            output_citations.append(output_dict)
        i += 1

    print("Num mentions:", len(num_candidates))
    print("Average candidates per mention:", np.mean(num_candidates))
    print("Min, median, max candidates per mention:", np.min(num_candidates),
          np.median(num_candidates), np.max(num_candidates))
    print("unique:", sum(np.unique(num_candidates, return_counts=True)[1]))
    return output_citations, mention_citations
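
A sketch of how the function might be called. The knowledge-base path and the mention record are placeholders; the dictionary keys ("publication_id", "mention", "instance", "score") are the fields the function reads above.

# Hypothetical invocation; the path and the example mention are placeholders.
example_mentions = [{
    "publication_id": 12345,
    "mention": "National Longitudinal Survey of Youth",
    "instance": ("We estimate the model on data from the National "
                 "Longitudinal Survey of Youth collected in 1997 .").split(),
    "score": 0.92,
}]
citations, mention_citations = generate_citations_from_ner_mentions(
    example_mentions, kb_path="path/to/data_sets.json")
print(len(citations), "candidate citations generated")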
Example #6
import os
from s2base import scispacy_util
from tqdm import tqdm
import re
from create_sgtb_dataset import get_scispacy_doc
import logging
#import nltk
#nltk.download('stopwords')

logging.basicConfig(level=logging.ERROR)

# the path to the test publications.json
PUB_PATH = os.path.abspath(os.path.join("data", "test", "publications.json"))
# the path to the test text files
TEXT_FILES_PATH = os.path.abspath(os.path.join("data", "test"))
# an instance of SciSpaCyParser
SCISPACY_PARSER = scispacy_util.SciSpaCyParser()


def create_conll_line(token):
    """Create one line of the output conll file

       @param token: the token for the line being created
    """
    word = token.text
    pos = token.pos_
    tag = "O"
    linking_tag = "_"
    entity_tag = "O"

    output_line = word + " " + pos + " " + tag + " " + entity_tag
    return output_line
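
A small usage sketch for the helper above, relying on the returned line; the sample sentence and output filename are placeholders.

# Hypothetical usage: parse a snippet of publication text and write one
# CoNLL line per token. The sentence and output path are placeholders.
doc = SCISPACY_PARSER.scispacy_create_doc("We analyze the Add Health survey data .")
with open("example.conll", "w") as conll_file:
    for token in doc:
        conll_file.write(create_conll_line(token) + "\n")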
Example #7
def create_dataset_input(rule_based_candidates,
                         mention_context_cache_path,
                         data_folder_path,
                         overall_output_path=None,
                         is_test=False,
                         output_path=None,
                         overwrite_dataset=False):
    """Function to take in the rule based candidates and create
       the input format for the SGTB model. This function is intended
       to be used for processing test data, as the main function in
       this file will convert and save train, dev, and test output.

       @param rule_based_candidates: a list of candidates from the rule based model
       @param mention_context_cache_path: path to a dictionary mapping <pub_id>:<mention_text> pairs to all contexts
       @param data_folder_path: path to the data folder
       @param overall_output_path: path to the overall output folder (optional, used for SGTB training)
       @param is_test: parameter indicating whether or not the data being processed is test data
       @param output_path: the path to write the output to (if not processing test data)
       @param overwrite_dataset: whether or not to overwrite the existing dataset (will be true for train
                                 and false for dev and test)
    """
    scispacy_parser = scispacy_util.SciSpaCyParser()
    prior_entity_probs = compute_entity_probabilities()
    prior_entity_given_mention_probs = compute_entity_given_mention_probs()
    prior_mention_given_entity_probs = compute_mention_given_entity_probs()

    glove_path = os.path.abspath(
        os.path.join("project", "data", "glove", "glove.6B.50d.txt"))
    with open(glove_path, "r") as lines:
        glove = {
            line.split()[0]:
            np.array([float(value) for value in line.split()[1:]])
            for line in lines
        }

    # I haven't run experiments to tell whether having a cache actually helps;
    # loading the cache takes a while when it is used
    # if is_test:
    #     mention_context_cache = {}
    # else:
    #     try:
    #         print("Loading cache...")
    #         mention_context_cache = joblib.load(mention_context_cache_path)["cache"]
    #         print("Cache loaded...")
    #     except:
    #         mention_context_cache = {}
    mention_context_cache = {}

    kb_path = os.path.abspath(os.path.join("project", "data",
                                           "data_sets.json"))
    with open(kb_path) as kb_file:
        kb_json = json.load(kb_file)

    dataset_id_to_kb_entry = {}
    for dataset in kb_json:
        dataset_id_to_kb_entry[dataset["data_set_id"]] = dataset

    matcher = Matcher(scispacy_parser.nlp.vocab)
    section_matcher = Matcher(scispacy_parser.nlp.vocab)
    for section_name in SECTION_STRINGS:
        # add three patterns per section name: "<name>\n", "<name>:", and "\n<name>."
        section_matcher.add(section_name, None,
                            [{"LOWER": section_name}, {"ORTH": "\n"}],
                            [{"LOWER": section_name}, {"ORTH": ":"}],
                            [{"ORTH": "\n"}, {"LOWER": section_name}, {"ORTH": "."}])

    output_docs = []
    pub_ids = []
    # we will write a new file for the first document, and append to it afterwards
    first_doc = True
    cache_changed = False
    for pub_id in tqdm(rule_based_candidates,
                       desc='create dataset in create_sgtb_dataset.py'):
        spacy_doc = get_scispacy_doc(data_folder_path, pub_id, scispacy_parser)

        pub_ids.append(pub_id)
        doc_candidates = rule_based_candidates[pub_id]
        output_doc = []

        dataset_id_to_longest_mention_text = {}
        for row in doc_candidates:
            mention_text = row["mention"]
            dataset_id = row["candidate_dataset_ids"][0]
            if dataset_id in dataset_id_to_longest_mention_text:
                if len(mention_text) > len(
                        dataset_id_to_longest_mention_text[dataset_id]):
                    dataset_id_to_longest_mention_text[
                        dataset_id] = mention_text
            else:
                dataset_id_to_longest_mention_text[dataset_id] = mention_text

        for row in doc_candidates:
            mention_text = row["mention"]
            dataset_id = row["candidate_dataset_ids"][0]
            # if mention_text != dataset_id_to_longest_mention_text[dataset_id]:
            #     continue

            mention_context_cache_key = str(pub_id) + "_" + mention_text
            if mention_context_cache_key in mention_context_cache:
                mention_contexts = mention_context_cache[
                    mention_context_cache_key]
            else:
                # search for the mention text in the doc
                spacy_mention_text = scispacy_parser.scispacy_create_doc(
                    mention_text)

                pattern = []
                for token in spacy_mention_text:
                    pattern.append({"ORTH": token.text})
                try:
                    matcher.add("MENTION", None, pattern)
                    matches = list(matcher(spacy_doc))
                except ValueError:
                    continue

                # build and save a mapping of <pub_id>_<mention_text> to all contexts the mention
                # is found in
                cache_changed = True
                mention_contexts = []
                token_idx_to_sent_idx = {}
                sentences_list = list(spacy_doc.sents)
                context_size = 3
                for sent_idx, sent in enumerate(sentences_list):
                    for token in sent:
                        token_idx = token.i
                        token_idx_to_sent_idx[token_idx] = sent_idx

                for match_id, start, end in matches:
                    sentence_idx = token_idx_to_sent_idx[start]
                    start_context_sent_idx = max(0,
                                                 sentence_idx - context_size)
                    if start_context_sent_idx == 0:
                        match_sentence_idx = sentence_idx
                    else:
                        match_sentence_idx = context_size
                    end_context_sent_idx = min(len(sentences_list),
                                               sentence_idx + context_size)
                    mention_context = sentences_list[
                        start_context_sent_idx:end_context_sent_idx + 1]
                    sentences_as_docs = []
                    for sentence in mention_context:
                        sentences_as_docs.append(sentence.as_doc())

                    start_context_token_idx = sentences_list[
                        start_context_sent_idx].start
                    end_context_token_idx = sentences_list[end_context_sent_idx
                                                           - 1].end
                    context_with_offsets = (sentences_as_docs,
                                            (start_context_token_idx,
                                             end_context_token_idx),
                                            (start, end), match_sentence_idx)
                    mention_contexts.append(context_with_offsets)

                # limit featurizing to first 3 contexts in order of appearance
                mention_contexts = mention_contexts[:3]
                mention_context_cache[
                    mention_context_cache_key] = mention_contexts

                matcher.remove("MENTION")

            if mention_contexts != []:
                output_mention = create_output_mention(
                    is_test, row, prior_entity_probs,
                    prior_entity_given_mention_probs, mention_text,
                    prior_mention_given_entity_probs, dataset_id_to_kb_entry,
                    mention_contexts, scispacy_parser, glove, spacy_doc,
                    section_matcher)
                output_doc.append(output_mention)

        # only write output to file if not processing test data
        if not is_test:
            if first_doc:
                with open(output_path, "w") as output_file:
                    json.dump(output_doc, output_file)
                    output_file.write("\n")
                first_doc = False

                if overwrite_dataset:
                    with open(overall_output_path, "w") as overall_output_file:
                        json.dump(output_doc, overall_output_file)
                        overall_output_file.write("\n")
            else:
                with open(output_path, "a") as output_file:
                    json.dump(output_doc, output_file)
                    output_file.write("\n")

                with open(overall_output_path, "a") as overall_output_file:
                    json.dump(output_doc, overall_output_file)
                    overall_output_file.write("\n")

        output_docs.append(json.loads(json.dumps(output_doc)))

    # if cache_changed and not is_test:
    #     joblib.dump({"cache": mention_context_cache}, mention_context_cache_path)
    return output_docs, pub_ids
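
The key lookup step in the function above is the spaCy Matcher built from the mention's exact tokens; the Matcher.add(name, None, pattern) call indicates the spaCy 2.x API. Below is a standalone sketch of that step, with placeholder text and a blank English pipeline standing in for the scispacy model.

import spacy
from spacy.matcher import Matcher

# Standalone sketch of the exact-token mention lookup used above (spaCy 2.x API).
# The text and mention are placeholders; the real code parses full publications
# with a scispacy pipeline.
nlp = spacy.blank("en")
doc = nlp("Results are based on the Current Population Survey microdata .")
mention = nlp("Current Population Survey")

matcher = Matcher(nlp.vocab)
pattern = [{"ORTH": token.text} for token in mention]
matcher.add("MENTION", None, pattern)

for match_id, start, end in matcher(doc):
    print(start, end, doc[start:end].text)

matcher.remove("MENTION")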