Example #1
    def __init__(self,
                 db: FeverDocDB,
                 wiki_tokenizer: Tokenizer = None,
                 claim_tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        self._wiki_tokenizer = wiki_tokenizer or WordTokenizer()
        self._claim_tokenizer = claim_tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

        self.db = db

        self.formatter = FEVERSentenceFormatter(set(self.db.get_doc_ids()), FEVERLabelSchema())
        self.reader = JSONLineReader()
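
A minimal usage sketch for this constructor. The owning class is not shown in the snippet, so FEVERSentenceReader below is a placeholder name, and the db path is borrowed from the later examples:

db = FeverDocDB("data/fever/fever.db")
reader = FEVERSentenceReader(db)  # placeholder class name (assumption);
                                  # tokenizers/indexers fall back to the
                                  # WordTokenizer / SingleIdTokenIndexer defaults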
Example #2
class FEVERSentenceRelatednessFormatter(FeverFormatter):
    def __init__(self, idx, db, ls):
        super().__init__(idx, ls)
        self.label_schema = ls
        self.ols = FEVERLabelSchema()
        self.db = db

    def format_line(self, line):
        annotation = line["label"]
        if annotation is None:
            annotation = line["verifiable"]

        if self.ols.get_id(annotation) != self.ols.get_id("not enough info"):
            annotation = "related"
        else:
            annotation = "unrelated"

        evidence_texts = []
        claim = self.tokenize(line['claim']).strip()
        for page in set([ev[2] for ev in line['evidence']]):
            # each ev is [annotation_id, evidence_id, page, line_number]
            evidences = set(
                [ev[3] for ev in line['evidence'] if ev[2] == page])
            doc_lines = self.db.get_doc_lines(page)
            if any(ev < 0 for ev in evidences):
                evidence_texts.append("")
            else:
                # accumulate evidence across pages instead of overwriting it
                evidence_texts.extend(
                    doc_lines.split("\n")[line_num].split("\t")[1].split()
                    for line_num in evidences)

        return {
            "claim": claim,
            "sentences": evidence_texts,
            "label": self.label_schema.get_id(annotation),
            "label_text": annotation
        }
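
To make the line-splitting above concrete: get_doc_lines(page) is assumed to return the page as numbered, tab-separated sentences, one per line. A self-contained illustration with invented text:

doc_lines = "0\tColin Firth is an actor .\n1\tHe won an Academy Award ."
evidences = {1}  # gold sentence ids for one page

sentences = [doc_lines.split("\n")[line_num].split("\t")[1].split()
             for line_num in evidences]
print(sentences)  # [['He', 'won', 'an', 'Academy', 'Award', '.']]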
Example #3
    def __init__(self,
                 db: FeverDocDB,
                 sentence_level=False,
                 wiki_tokenizer: Tokenizer = None,
                 claim_tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 ner_facts=False,
                 filtering: str = None) -> None:
        self._sentence_level = sentence_level
        self._ner_facts = ner_facts
        self._wiki_tokenizer = wiki_tokenizer or WordTokenizer()
        self._claim_tokenizer = claim_tokenizer or WordTokenizer()

        self._token_indexers = token_indexers or {
            'elmo': ELMoTokenCharactersIndexer(),
            'tokens': SingleIdTokenIndexer()
        }

        self.db = db

        self.formatter = FEVERGoldFormatter(set(self.db.get_doc_ids()),
                                            FEVERLabelSchema(),
                                            filtering=filtering)
        self.reader = JSONLineReader()
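
As above, a hedged construction sketch; FEVERReader stands in for the unshown class that owns this __init__:

reader = FEVERReader(db=FeverDocDB("data/fever/fever.db"),  # stand-in name (assumption)
                     sentence_level=True,  # consume gold sentences, not whole pages
                     ner_facts=False,
                     filtering=None)
# each claim token is then indexed twice: by vocabulary id ('tokens')
# and as a character sequence for ELMo ('elmo')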
Example #4
    mname = args.model
    logger.info("Model name is {0}".format(mname))

    ffns = []

    if args.sentence:
        logger.info("Model is Sentence level")
        ffns.append(SentenceLevelTermFrequencyFeatureFunction(db, naming=mname))
    else:
        logger.info("Model is Document level")
        ffns.append(TermFrequencyFeatureFunction(db, naming=mname))

    f = Features(mname, ffns)
    jlr = JSONLineReader()

    formatter = FEVERGoldFormatter(None, FEVERLabelSchema(), filtering=args.filtering)

    train_ds = DataSet(file=args.train, reader=jlr, formatter=formatter)
    dev_ds = DataSet(file=args.dev, reader=jlr, formatter=formatter)

    train_ds.read()
    dev_ds.read()

    test_ds = None
    if args.test is not None:
        test_ds = DataSet(file=args.test, reader=jlr, formatter=formatter)
        test_ds.read()

    train_feats, dev_feats, test_feats = f.load(train_ds, dev_ds, test_ds)
    f.save_vocab(mname)
                        help=("String option specifying tokenizer type to use "
                              "(e.g. 'corenlp')"))

    parser.add_argument('--num-workers',
                        type=int,
                        default=None,
                        help='Number of CPU processes (for tokenizing, etc)')
    args = parser.parse_args()
    doc_freqs = None
    if args.use_precomputed:
        _, metadata = utils.load_sparse_csr(args.model)
        doc_freqs = metadata['doc_freqs'].squeeze()

    db = FeverDocDB("data/fever/fever.db")
    jlr = JSONLineReader()
    formatter = FEVERGoldFormatter(set(), FEVERLabelSchema())

    with open(args.in_file, "r") as f, open(
            "data/fever/{0}.sentences.{3}.p{1}.s{2}.jsonl".format(
                args.split, args.max_page, args.max_sent,
                "precomputed" if args.use_precomputed else "not_precomputed"),
            "w+") as out_file:
        lines = jlr.process(f)
        #lines = tf_idf_claims_batch(lines)

        for line in tqdm(lines):
            line = tf_idf_claim(line)
            out_file.write(json.dumps(line) + "\n")
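
For clarity, the output-path template above expands as follows (runnable check with invented argument values):

template = "data/fever/{0}.sentences.{3}.p{1}.s{2}.jsonl"
print(template.format("dev", 5, 5, "precomputed"))
# data/fever/dev.sentences.precomputed.p5.s5.jsonl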
Example #6
    def __init__(self,
                 db: FeverDocDB) -> None:
        self.db = db
        self.formatter = FEVERGoldFormatter(set(self.db.get_doc_ids()),
                                            FEVERLabelSchema())
        self.reader = JSONLineReader()
Example #7
    logger.info("Model name is {0}".format(mname))

    ffns = []

    if args.sentence:
        logger.info("Model is Sentence level")
        ffns.append(SentenceLevelTermFrequencyFeatureFunction(db,
                                                              naming=mname))
    else:
        logger.info("Model is Document level")
        ffns.append(TermFrequencyFeatureFunction(db, naming=mname))

    f = Features(mname, ffns)
    f.load_vocab(mname)

    jlr = JSONLineReader()
    formatter = FEVERGoldFormatter(None, FEVERLabelSchema())

    test_ds = DataSet(file=args.test, reader=jlr, formatter=formatter)
    test_ds.read()
    feats = f.lookup(test_ds)

    input_shape = feats[0].shape[1]
    model = SimpleMLP(input_shape, 100, 3)

    if gpu():
        model.cuda()

    model.load_state_dict(torch.load("models/{0}.model".format(mname)))
    print_evaluation(model, feats, FEVERLabelSchema(), args.log)
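
SimpleMLP itself is not shown in these snippets; judging from SimpleMLP(input_shape, 100, 3) and the model.cuda() call, a plausible stand-in is a two-layer PyTorch classifier. This is an assumed reconstruction, not the project's actual definition:

import torch.nn as nn

class SimpleMLP(nn.Module):
    # assumed signature: (input_dim, hidden_dim, num_classes)
    def __init__(self, input_dim, hidden_dim, num_classes):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(input_dim, hidden_dim),
                                 nn.ReLU(),
                                 nn.Linear(hidden_dim, num_classes))

    def forward(self, x):
        return self.net(x)  # unnormalised class scores (logits)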
Example #8
    def __init__(self, idx, db, ls):
        super().__init__(idx, ls)
        self.label_schema = ls
        self.ols = FEVERLabelSchema()
        self.db = db
Example #9
    ffns = []

    if args.sentence:
        logger.info("Model is Sentence level")
        ffns.append(SentenceLevelTermFrequencyFeatureFunction(db,
                                                              naming=mname))
    else:
        logger.info("Model is Document level")
        ffns.append(TermFrequencyFeatureFunction(db, naming=mname))

    f = Features(mname, ffns)
    jlr = JSONLineReader()

    formatter = FEVERGoldFormatter(None,
                                   FEVERLabelSchema(),
                                   filtering=args.filtering)

    train_ds = DataSet(file=args.train, reader=jlr, formatter=formatter)
    dev_ds = DataSet(file=args.dev, reader=jlr, formatter=formatter)

    train_ds.read()
    dev_ds.read()

    test_ds = None
    if args.test is not None:
        test_ds = DataSet(file=args.test, reader=jlr, formatter=formatter)
        test_ds.read()

    train_feats, dev_feats, test_feats = f.load(train_ds, dev_ds, test_ds)
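
Since test_ds may be None, the returned test_feats slot presumably mirrors that; a defensive way to consume the result (a sketch, assuming f.load returns the same list-of-matrices structure that f.lookup exposes in Example #7):

for name, feats in (("train", train_feats), ("dev", dev_feats), ("test", test_feats)):
    if feats is not None:
        print(name, feats[0].shape)  # feats[0] is the feature matrix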
Example #11
if __name__ == "__main__":

    db = FeverDocDB("data/fever/drqa.db")
    idx = set(db.get_doc_ids())

    fnc_bodies = Bodies("data/fnc-1/train_bodies.csv",
                        "data/fnc-1/competition_test_bodies.csv")
    fever_bodies = db

    f = Features(
        [FeverOrFNCTermFrequencyFeatureFunction(fever_bodies, fnc_bodies)])
    csvr = CSVReader()
    jlr = JSONLineReader()
    fnc_formatter = FNCFormatter2(FNCSimpleLabelSchema())
    fever_formatter = FEVERPredictionsFormatter(idx, FEVERLabelSchema())

    train_ds = DataSet(file="data/fnc-1/train_stances.csv",
                       reader=csvr,
                       formatter=fnc_formatter)
    dev_ds = DataSet(file="data/fnc-1/competition_test_stances.csv",
                     reader=csvr,
                     formatter=fnc_formatter)
    test_ds = DataSet(file="data/fever/fever.dev.pages.p5.jsonl",
                      reader=jlr,
                      formatter=fever_formatter)

    train_ds.read()
    test_ds.read()
    dev_ds.read()
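
For orientation, the FNC-1 stance files read above are CSVs with the columns Headline, Body ID and Stance; a stand-alone peek at the first row using only the standard library:

import csv

with open("data/fnc-1/train_stances.csv") as f:
    first = next(csv.DictReader(f))
    print(first["Headline"], first["Body ID"], first["Stance"])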
Example #12
    LogHelper.get_logger(__name__)

    parser = argparse.ArgumentParser()


    parser.add_argument('db', type=str, help='/path/to/saved/db.db')
    parser.add_argument('in_file', type=str, help='/path/to/input.jsonl')
    parser.add_argument('--max_page', type=int)
    parser.add_argument('--max_sent', type=int)
    parser.add_argument('--split', type=str)
    parser.add_argument("--filtering", type=str, default=None)
    args = parser.parse_args()

    db = FeverDocDB(args.db)  # use the parsed 'db' path rather than a hard-coded one
    jlr = JSONLineReader()
    formatter = FEVERGoldFormatter(set(), FEVERLabelSchema(), filtering=args.filtering)

    train_ds = DataSet(file="data/fever/train.ns.pages.p{0}.jsonl".format(1), reader=jlr, formatter=formatter)
    dev_ds = DataSet(file="data/fever/dev.pages.p{0}.jsonl".format(args.max_page), reader=jlr, formatter=formatter)

    train_ds.read()
    dev_ds.read()

    tf = XTermFrequencyFeatureFunction(db)
    tf.inform(train_ds.data, dev_ds.data)

    with open(args.in_file,"r") as f:
        lines = jlr.process(f)
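
JSONLineReader is never defined in these snippets, but since Example #5 writes one json.dumps(...) per line, its process method presumably just parses JSON Lines. A minimal stand-in:

import json

def process(fp):
    # assumed JSONLineReader.process behaviour: one JSON object per line
    return [json.loads(line) for line in fp if line.strip()]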