Example #1
def main():
    # Load in the data
    prompts = np.asarray(preprocessor.read_prompts())
    annotations = np.asarray(preprocessor.read_annotations())
    data = {}
    preds, y_test = [], []

    
    # store data in dictionary
    for p in prompts:
        data[p[0]] = {'xml': p[1], 'outcome': p[2], 'intervention': p[3], 'comparator': p[4], 'answer': '', 'reasoning': ''}
        
    # fill in the answer and accumulate reasoning text when the corresponding flags are set
    for a in annotations:
        if a[3]:
            data[a[1]]['answer'] = a[7]
        if a[4]:
            data[a[1]]['reasoning'] += str(a[6])
       
    test_id = preprocessor.test_document_ids()
    # get predictions and add them to array
    for k in data.keys():
        id_   = data[k]['xml']

        # skip documents that are not in the test split
        if id_ not in test_id:
            continue

        # try to parse each text field to strip out malformed values
        out   = try_except_parse(data[k]['outcome'])
        inter = try_except_parse(data[k]['intervention'])
        cmp   = try_except_parse(data[k]['comparator'])
        ans   = try_except_parse(data[k]['answer'])
        res   = try_except_parse(data[k]['reasoning'])
        
        if ans == '':
            continue  # we don't have a valid answer for this one...
            
        y_test.append(ans)

        # just use the reasoning as our sentence        
        likely_sentence = res
        guess = eval_sentence(likely_sentence, out, inter, cmp)
        
        if guess == "No significant difference":
            preds.append(0)
        elif guess == "Significantly decreased":
            preds.append(-1)
        else:
            preds.append(1)
         
    acc  = accuracy_score(y_test, preds)
    f1   = f1_score(y_test, preds, average='macro')
    prec = precision_score(y_test, preds, average='macro')
    rec  = recall_score(y_test, preds, average='macro')
        
    return acc, f1, prec, rec
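
main() calls a try_except_parse helper that is not included in this excerpt; its behaviour below is an assumption, not confirmed by the source. A minimal sketch, assuming it simply attempts to parse the raw field into a Python value and falls back to the original input when parsing fails:

import ast

def try_except_parse(value):
    # Hypothetical helper (not shown in this excerpt): attempt a literal parse
    # of the raw field and fall back to the original value on failure, so the
    # downstream string comparisons still receive something usable.
    try:
        return ast.literal_eval(value)
    except (ValueError, SyntaxError):
        return value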
def load_data(use_test, bow=True):
    """
    Load the data into train/val/test splits that allow for easy access.

    @return bag-of-words representation of the training, validation, and test sets (with labels).
    """

    prompts = preprocessor.read_prompts()
    annotations = preprocessor.read_annotations()

    # filter out prompts for which we have no annotations
    # (this affected only a single prompt in the data)
    def have_annotations_for_prompt(prompt_id):
        return len(
            annotations[annotations[PROMPT_ID_COL_NAME] == prompt_id]) > 0

    prompts = [
        prompt for row_idx, prompt in prompts.iterrows()
        if have_annotations_for_prompt(prompt[PROMPT_ID_COL_NAME])
    ]
    prompts = pd.DataFrame(prompts)

    # Sort into training and validation by article id
    train_doc_ids = preprocessor.train_document_ids()
    val_doc_ids = preprocessor.validation_document_ids()
    test_doc_ids = preprocessor.test_document_ids()

    # carve out a random 10% of the training documents as a dev set
    dev_doc_ids = list(train_doc_ids)
    random.shuffle(dev_doc_ids)
    dev_doc_ids = set(dev_doc_ids[:int(len(dev_doc_ids) * .1)])

    x_train, y_train, x_dev, y_dev, x_val, y_val, x_test, y_test = [], [], [], [], [], [], [], []
    pids = prompts[STUDY_ID_COL].values
    for i in range(len(pids)):
        annotations_for_prompt = annotations[annotations[PROMPT_ID_COL_NAME] ==
                                             prompts["PromptID"].values[i]]
        labels = annotations_for_prompt[[LBL_COL_NAME,
                                         EVIDENCE_COL_NAME]].values
        id_ = pids[i]

        # collect every annotator's reasoning text for this prompt
        articles = [a[1] for a in labels]

        for article_text in articles:
            # extract i/c/o
            out = prompts["Outcome"].values[i].lower()
            inter = prompts["Intervention"].values[i].lower()
            cmp = prompts["Comparator"].values[i].lower()

            # add to correct pile: train/val/test
            tmp = [article_text, out, inter, cmp]
            loss = stats.mode([l1[0] for l1 in labels])[0][0]  # majority label across annotators

            if id_ in dev_doc_ids and not use_test:
                x_dev.append(tmp)
                y_dev.append(loss)
            elif id_ in train_doc_ids:
                x_train.append(tmp)
                y_train.append(loss)
            elif id_ in val_doc_ids:
                x_val.append(tmp)
                y_val.append(loss)
            elif id_ in test_doc_ids:
                x_test.append(tmp)
                y_test.append(loss)
            else:
                raise ValueError("Unknown study id {}".format(id_))

    # transform to np.array
    y_test = np.asarray(y_test)

    # if we are removing the test set, use validation as test set.
    if not use_test:
        x_test = x_val
        y_test = y_val
        x_val = x_dev
        y_val = y_dev

    print("Running bag of words...")
    if bow:
        return bag_of_words(x_train, y_train, x_val, y_val, x_test, y_test)
    return [x_train, y_train, x_val, y_val, x_test, y_test]
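
load_data hands the assembled splits to a bag_of_words helper that is not part of this excerpt. The sketch below is only an illustration, assuming the helper joins each [reasoning, outcome, intervention, comparator] row into one document and fits a CountVectorizer on the training split only; the extra integer argument passed by the second variant further below is treated here as a minimum document frequency purely for illustration:

from sklearn.feature_extraction.text import CountVectorizer

def bag_of_words(x_train, y_train, x_val, y_val, x_test, y_test, min_df=1):
    # Hypothetical implementation: each x row is [text, outcome, intervention,
    # comparator]; join the fields into one document and count-vectorize,
    # fitting the vocabulary on the training documents only.
    to_docs = lambda rows: [" ".join(str(field) for field in row) for row in rows]
    vectorizer = CountVectorizer(min_df=min_df)
    X_train = vectorizer.fit_transform(to_docs(x_train))
    X_val = vectorizer.transform(to_docs(x_val))
    X_test = vectorizer.transform(to_docs(x_test))
    return X_train, y_train, X_val, y_val, X_test, y_test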
def load_data(use_test, bow=True):
    """
    Load the data into train/val/test splits that allow for easy access.

    @return bag-of-words representation of the training, validation, and test sets (with labels).
    """

    prompts = preprocessor.read_prompts()
    annotations = preprocessor.read_annotations()

    # filter out prompts for which we have no annotations
    # (this affected only a single prompt in the data)
    def have_annotations_for_prompt(prompt_id):
        return len(
            annotations[annotations[PROMPT_ID_COL_NAME] == prompt_id]) > 0

    prompts = [
        prompt for row_idx, prompt in prompts.iterrows()
        if have_annotations_for_prompt(prompt[PROMPT_ID_COL_NAME])
    ]
    prompts = pd.DataFrame(prompts)

    # Sort into training and validation by article id
    train_doc_ids = preprocessor.train_document_ids()
    val_doc_ids = preprocessor.validation_document_ids()
    test_doc_ids = preprocessor.test_document_ids()

    # carve out a random 10% of the training documents as a dev set
    dev_doc_ids = list(train_doc_ids)
    random.shuffle(dev_doc_ids)
    dev_doc_ids = set(dev_doc_ids[:int(len(dev_doc_ids) * .1)])

    x_train, y_train, x_dev, y_dev, x_val, y_val, x_test, y_test = [], [], [], [], [], [], [], []
    pids = prompts[STUDY_ID_COL].values
    for i in range(len(pids)):
        id_, data, losses = parse_prompt_id_data(annotations, prompts, pids, i)

        in_training = id_ in (train_doc_ids - dev_doc_ids)
        # get a reasoning from previous/next prompt id
        if i > 0 and id_ == pids[i - 1] and not in_training:
            _, mismatched_data, _ = parse_prompt_id_data(
                annotations, prompts, pids, i - 1)
            # add the mismatched data here
            row = copy.deepcopy(data[0])
            row[1] = mismatched_data[0][1]
            data.append(row)
            losses.append(losses[-1])

        elif i < len(pids) - 1 and id_ == pids[i + 1] and not in_training:
            _, mismatched_data, _ = parse_prompt_id_data(
                annotations, prompts, pids, i + 1)
            # add the mismatched data here
            row = copy.deepcopy(data[0])
            row[1] = mismatched_data[0][1]
            data.append(row)
            losses.append(losses[-1])

        for j in range(len(data)):
            tmp = data[j]
            loss = losses[j]

            # find where to put this section
            if id_ in dev_doc_ids and not use_test:
                x_dev.append(tmp)
                y_dev.append(loss)
            elif id_ in train_doc_ids:
                x_train.append(tmp)
                y_train.append(loss)
            elif id_ in val_doc_ids:
                x_val.append(tmp)
                y_val.append(loss)
            elif id_ in test_doc_ids:
                x_test.append(tmp)
                y_test.append(loss)
            else:
                raise ValueError("Unknown study id {}".format(id_))

    # if we are removing the test set, use validation as test set.
    if not use_test:
        x_test = x_val
        y_test = y_val
        x_val = x_dev
        y_val = y_dev

    if bow:
        return bag_of_words(x_train, y_train, x_val, y_val, x_test, y_test, 5)
    return [x_train, y_train, x_val, y_val, x_test, y_test]
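
This variant factors the per-prompt work into a parse_prompt_id_data helper that is not shown here. Judging only from the inline logic in the previous example, a plausible reconstruction (one [reasoning, outcome, intervention, comparator] row per annotation, each paired with the prompt's majority label, reusing the same column-name constants as above) might look like:

from scipy import stats

def parse_prompt_id_data(annotations, prompts, pids, i):
    # Hypothetical reconstruction, mirroring the inline version in the previous example.
    annotations_for_prompt = annotations[
        annotations[PROMPT_ID_COL_NAME] == prompts["PromptID"].values[i]]
    labels = annotations_for_prompt[[LBL_COL_NAME, EVIDENCE_COL_NAME]].values

    out = prompts["Outcome"].values[i].lower()
    inter = prompts["Intervention"].values[i].lower()
    cmp_ = prompts["Comparator"].values[i].lower()
    loss = stats.mode([l1[0] for l1 in labels])[0][0]  # majority label

    data = [[reasoning, out, inter, cmp_] for _, reasoning in labels]
    losses = [loss] * len(data)
    return pids[i], data, losses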
def evidence_span_found(text, ev, i, f, search_range, min_dist):
    # Hypothetical signature: the original header of this function is missing
    # from the excerpt. `ev` is the annotated evidence string, `i`/`f` its
    # expected start/end offsets in `text`, and `min_dist` the initial distance bound.
    min_span = ''
    for i_offset in range(-search_range, search_range):
        for f_offset in range(-search_range, search_range):
            span = text[i + i_offset:f + f_offset]
            dist = string_distance(ev.strip(' '), span.strip(' '))
            if dist <= min_dist:
                min_dist = dist
                min_span = span

    if min_span:
        return True

    return False
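
The span search above depends on a string_distance helper that is not included in the excerpt. As a stand-in, under the assumption that it measures plain edit distance, a small Levenshtein implementation could serve:

def string_distance(a, b):
    # Hypothetical helper: Levenshtein edit distance between two strings,
    # computed row by row with dynamic programming.
    prev = list(range(len(b) + 1))
    for row, ca in enumerate(a, start=1):
        curr = [row]
        for col, cb in enumerate(b, start=1):
            cost = 0 if ca == cb else 1
            curr.append(min(prev[col] + 1,          # deletion
                            curr[col - 1] + 1,      # insertion
                            prev[col - 1] + cost))  # substitution
        prev = curr
    return prev[len(b)]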


annotations = pp.read_annotations()

# tallies for how well the stored evidence offsets line up with the article text
counter = 0
almost = 0
exact = 0
total = len(annotations)
for _, annot in annotations.iterrows():
    article_file = path.join(DATA_DIR,
                             "txt_files/PMC" + str(annot.PMCID) + ".txt")
    with open(article_file, encoding='utf-8') as f:
        text = f.read()

    start, end = annot["Evidence Start"], annot["Evidence End"]

    raw_text = text[start:end + 1]
    saved_text = pp.extract_raw_text(pp.get_article(annot.PMCID))[start:end +