Example #1
    def __sample_file(self, dir, file):
        """Creates a local from a specific file in a given directory.

        :param dir: directory (relative to path_to_dbpedia)
        :param file:
        """
        t = Triple()
        p = NTriplesParser(t)
        infile = os.path.join(self.path_to_dbpedia, dir, file)
        outfile = os.path.join(self.output_dir, dir, file)
        print("Processing file " + file + " ...")
        i = 0
        with FileUtils.open_file_by_type(infile) as fin:
            # output file will be of the same type as the input
            fout = FileUtils.open_file_by_type(outfile, mode="w")
            for line in fin:
                try:
                    p.parsestring(line.decode("utf-8"))
                except ParseError:  # skip lines that couldn't be parsed
                    continue
                if t.subject() is None:  # only if parsed as a triple
                    continue
                subj = self.prefix.get_prefixed(t.subject())  # prefixing subject
                if subj in self.sample_entities:
                    fout.write(line)
                i += 1
                if i % 100000 == 0:
                    print(str(i // 1000) + "K lines processed")
            fout.close()
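
A note on the helper used above: FileUtils.open_file_by_type is called throughout these examples but never shown. A minimal sketch of the behavior the snippets rely on, assuming the file type is inferred from the extension (the gzip/bz2 dispatch below is an assumption, not code from the project):

import bz2
import gzip


def open_file_by_type(file_name, mode="r"):
    # Assumed behavior: dispatch on the file extension. Compressed files are
    # handled by the matching module and yield bytes, which is why the callers
    # above decode() each line; everything else falls back to plain open().
    if file_name.endswith(".gz"):
        return gzip.open(file_name, mode)
    if file_name.endswith(".bz2"):
        return bz2.open(file_name, mode)
    return open(file_name, mode)
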
Example #2
    def batch_identification(self):
        """Annotates, in a batch, queries with identified target types, and outputs results."""
        queries = json.load(FileUtils.open_file_by_type(self.__query_file))

        f_trec_out = None
        if "trec_output_file" in self.__config:  # for TREC-formatted outputting
            f_trec_out = FileUtils.open_file_by_type(
                self.__config["trec_output_file"], mode="w")

        results = dict()
        for query_id in sorted(queries):
            PLOGGER.info("Identifying target types for [{}] {}".format(
                query_id, queries[query_id]))
            results[query_id] = self.identify(queries[query_id])

            # Output resulting scores in TREC format if required
            if f_trec_out:
                type_to_score = dict()
                for d in results.get(query_id, {}).get("results", {}).values():
                    type_to_score[d["type"]] = d["score"]
                ret_res = RetrievalResults(type_to_score)
                ret_res.write_trec_format(query_id,
                                          self.__config["run_id"],
                                          f_trec_out,
                                          max_rank=self.__config["num_docs"])

        json.dump(results,
                  FileUtils.open_file_by_type(self.__output_file, mode="w"),
                  indent=4,
                  sort_keys=True)
        PLOGGER.info("Output file: {}".format(self.__output_file))

        if f_trec_out:
            f_trec_out.close()
Example #3
    def __init__(self):
        self.persons_ids = {}
        self.ids_persons = {}
        for line in FileUtils.read_file_as_list(PERSONS_IDS_F):
            person, id = line.split("\t", maxsplit=1)
            self.persons_ids[person] = id
            self.ids_persons[id] = person

        self.professions_ids = {}
        self.ids_professions = {}
        for line in FileUtils.read_file_as_list(PROFESSIONS_IDS_F):
            prof, id = line.split("\t", maxsplit=1)
            self.professions_ids[prof] = id
            self.ids_professions[id] = prof

        self.nationalities_ids = {}
        self.ids_nationalities = {}
        for line in FileUtils.read_file_as_list(NATIONALITIES_IDS_F):
            nation, id = line.split("\t", maxsplit=1)
            self.nationalities_ids[nation] = id
            self.ids_nationalities[id] = nation

        self.nationalities_countries = {}
        self.countries_nationalities = {}
        for line in FileUtils.read_file_as_list(COUNTRIES_NATIONALITIES_F):
            country, nation = line.split("\t", maxsplit=1)
            self.nationalities_countries[nation] = country
            self.countries_nationalities[country] = nation
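
FileUtils.read_file_as_list is likewise not shown in these examples. A minimal sketch, assuming it returns stripped, non-empty lines (the strip() is confirmed by a comment in Example #22; the rest is an assumption):

def read_file_as_list(file_name):
    # Assumed behavior: read every line, strip surrounding whitespace, and
    # drop empty lines. The real implementation may also route through
    # open_file_by_type, since Example #6 defensively decodes bytes.
    with open(file_name) as f:
        return [line.strip() for line in f if line.strip()]
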
Example #4
File: config.py  Project: zxlzr/nordlys
def load_nordlys_config(file_name):
    """Loads nordlys config file. If local file is provided, global one is ignored."""
    config_path = os.sep.join([BASE_DIR, "config"])
    local_config = os.sep.join([config_path, "local", file_name])
    if os.path.exists(local_config):
        return FileUtils.load_config(local_config)
    else:
        return FileUtils.load_config(os.sep.join([config_path, file_name]))
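
For illustration, a caller passes only the file name and the function resolves it under BASE_DIR/config, preferring a local override if one exists (the file name below is hypothetical):

# Hypothetical file name, for illustration only: this resolves to
# <BASE_DIR>/config/local/mongo.json if that file exists, otherwise to
# <BASE_DIR>/config/mongo.json.
mongo_config = load_nordlys_config("mongo.json")
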
Example #5
    def generate_features(self, kb_file, output_file1, output_file2):
        """Generate features of freq-person-nationality"""

        fout1 = open(output_file1, "w")
        fout2 = open(output_file2, "w")

        # write tsv header
        header = ["person", "nationality", "freq_person_nationality_noun"]
        fout1.write("\t".join(header) + "\n")
        header = ["person", "nationality", "freq_person_nationality_adj"]
        fout2.write("\t".join(header) + "\n")

        with FileUtils.open_file_by_type(kb_file) as kb_f:
            line_count = 1
            for line in kb_f:
                print(line_count)
                line_count += 1
                person_id, nat_id, noun, adj = line.strip().split("\t")
                values_noun = [person_id, nat_id]
                values_adj = [person_id, nat_id]
                nats = [noun, adj]
                fpn_noun, fpn_adj = self.get_per_nat_tf(person_id, nats)
                values_noun.append(str(fpn_noun))
                values_adj.append(str(fpn_adj))
                fout1.write("\t".join(values_noun) + "\n")
                fout2.write("\t".join(values_adj) + "\n")
        fout1.close()
        fout2.close()
Example #6
    def __load_entity_abstracts(self, filename):
        prefix = URIPrefix()
        t = Triple()
        p = NTriplesParser(t)
        lines_counter = 0
        PLOGGER.info("Loading entity abstracts from {}".format(filename))
        for line in FileUtils.read_file_as_list(filename):
            # basic line parsing
            line = line.decode("utf-8") if isinstance(line, bytes) else line
            try:
                p.parsestring(line)
            except ParseError:  # skip lines that couldn't be parsed
                continue
            if t.subject() is None:  # only if parsed as a triple
                continue

            # Subject and object identification
            subj = prefix.get_prefixed(t.subject())
            obj = ""
            if type(t.object()) is URIRef:
                # PLOGGER.error("Error: it is URIRef the parsed obj")
                pass
            else:
                obj = t.object().encode("utf-8")
                if len(obj) == 0:
                    continue  # skip empty objects
            self.__entity_abstracts[subj] = obj

            lines_counter += 1
            if lines_counter % 10000 == 0:
                PLOGGER.info("\t{}K lines processed".format(lines_counter // 1000))

        PLOGGER.info("\n### Loading entity abstracts... Done.")
Example #7
    def __load_items_stats(self, items_tfidf_fpath):
        """Loads pre-computed tf-idf stats for items.

        :param items_tfidf_fpath:
        :return:
        """
        item_term_weights = {}

        for line in FileUtils.read_file_as_list(items_tfidf_fpath):
            item, term, _, _, weight = line.split("\t", maxsplit=4)
            if len(term) < 4:  # avoid short terms
                continue

            # some cleanings
            if term.startswith("_"):
                term = term[1:]
            if term.endswith("_"):
                term = term[:-1]
            if term.startswith(WP_PREFIX):  # it's a person name
                term = term.split(WP_PREFIX)[-1]  # remove prefix
                term = "_".join([
                    word[0].upper() + word[1:] for word in term.split("_")
                ])  # capitalize every word

            item_d = item_term_weights.get(item, {})
            item_d[term] = float(weight)
            item_term_weights[item] = item_d

        return item_term_weights
Example #8
def main(args):
    s_t = time.time()  # start time

    config = FileUtils.load_config(args.config) if args.config != "" else get_config()
    r = Retrieval(config)
    r.batch_retrieval()

    e_t = time.time()  # end time
    print("Execution time(min):\t" + str((e_t - s_t) / 60) + "\n")
Example #9
def main(args):
    config = FileUtils.load_config(args.config)
    el = EL(config, Entity())

    if args.query:
        res = el.link(args.query)
        pprint(res)
    else:
        el.batch_linking()
Example #10
def main(args):
    config = FileUtils.load_config(args.config)
    tti = TTI(config)

    if args.query:
        res = tti.identify(args.query)
        pprint(res)
    else:
        tti.batch_identification()
Example #11
File: er.py  Project: zxlzr/nordlys
def main(args):
    config = FileUtils.load_config(args.config)
    er = ER(config, ElasticCache(DBPEDIA_INDEX))

    if args.query:
        res = er.retrieve(args.query)
        pprint(res)
    else:
        er.batch_retrieval()
Example #12
def main(args):
    config = FileUtils.load_config(args.config)
    er = ER(config)

    if args.query:
        res = er.retrieve(args.query)
        pprint(res)
    else:
        er.batch_retrieval()
Example #13
def make_persons_fb_ids(persons_fpath):
    """Our person ID is the Freebase ID where the prefix 'm.' is replaced with 'fb_'.

    :param persons_fpath: path to the 'persons' file.
    """
    with open(PERSONS_IDS_F, "w") as f_out:
        for line in FileUtils.read_file_as_list(persons_fpath):
            person, raw_fb_id = line.split("\t", maxsplit=1)
            fb_id = "fb_" + raw_fb_id[2:]
            f_out.write("{}\t{}\n".format(person, fb_id))
Example #14
def make_professions_kb_translation(dest_translation_fpath, persons_ids_fpath, professions_ids_fpath,
                                    person_items_fpath):
    """A person ID -to- item ID translation schema is convenient.

    :param dest_translation_fpath: destination file path of IDs translation.
    :param persons_ids_fpath: person IDs file path.
    :param professions_ids_fpath: profession IDs file path.
    :param person_items_fpath: path to the file with person items (a '.kb'-extension file).
    :return:
    """
    persons_ids = {}
    for line in FileUtils.read_file_as_list(persons_ids_fpath):
        person, id = line.split("\t", maxsplit=1)
        persons_ids[person] = id

    professions_ids = {}
    for line in FileUtils.read_file_as_list(professions_ids_fpath):
        prof, id = line.split("\t", maxsplit=1)
        professions_ids[prof] = id

    person_items = {}
    for line in FileUtils.read_file_as_list(person_items_fpath):
        person, item = line.split("\t", maxsplit=1)
        person_items.setdefault(person, []).append(item)

    translations = {}
    for person, items in person_items.items():
        if person not in persons_ids:
            continue
        person_id = persons_ids[person]
        items_ids = []
        for item in items:
            if item not in professions_ids:
                continue
            items_ids.append(professions_ids[item])

        translations[person_id] = items_ids

    with open(dest_translation_fpath, "w") as f_out:
        for person_id, items_ids in translations.items():
            for item_id in items_ids:
                f_out.write("{}\t{}\n".format(person_id, item_id))
Example #15
    def generate_features(self, kb_file, output_file):
        """Generating features related to term statistics"""

        fout = open(output_file, "w")

        # write tsv header
        header = ["person", "profession"]
        for k in self.K_VALUES:
            header.append("sumProfTerms_" + str(k))
            header.append("simCos_" + str(k))
        fout.write("\t".join(header) + "\n")

        with FileUtils.open_file_by_type(kb_file) as kb_f:
            for line in kb_f:
                person_id, prof_id = line.strip().split("\t")
                values = [person_id, prof_id]

                person_tf, num_sent = self.get_person_tf(person_id)

                for k in self.K_VALUES:
                    # we take top-K profession terms

                    # Compute sumProfTerms: \sum_{t \in T_k(pr)} \sum_{s \in S(pe)} tf(t,s) * w(t,pr),
                    # where w(t,pr) = TFIDF(t,pr), with TF(t,pr) = \sum_{s \in S(pr)} tf(t,s)
                    sum_prof_terms = 0
                    for term, tf in person_tf.items():
                        pt = self.__stats.get(prof_id, {}).get(term, {})
                        if pt.get("rank",
                                  100000) > k:  # skip term if not in top-K
                            continue
                        sum_prof_terms += tf * pt.get("tfidf", 0)
                    values.append(str(sum_prof_terms))

                    # compute simCosK
                    # where K is the top-K terms for the profession
                    vec_pr = []  # construct prof vector
                    vec_pe = []  # construct person vector

                    if prof_id in self.__stats:
                        for term, s in self.__stats[prof_id].items():
                            if s["rank"] <= k:
                                vec_pr.append(s["tfidf"])
                                idf = s["tfidf"] / s[
                                    "tf"]  # we back-generate IDF from profession's TF-IDF
                                vec_pe.append(person_tf.get(term, 0) * idf)
                        cos = cos_sim(vec_pr, vec_pe)
                    else:
                        cos = 0  # in some exceptional cases the profession does not have any sentences
                    values.append(str(cos))

                fout.write("\t".join(values) + "\n")
                print(values)

        fout.close()
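
cos_sim is not shown in these examples; presumably it is the standard cosine similarity over two equal-length vectors, along the lines of this sketch (an assumption, not the project's code):

import math


def cos_sim(vec_a, vec_b):
    # Standard cosine similarity; returns 0 for empty or all-zero vectors,
    # matching how the caller treats professions without any sentences.
    dot = sum(a * b for a, b in zip(vec_a, vec_b))
    norm_a = math.sqrt(sum(a * a for a in vec_a))
    norm_b = math.sqrt(sum(b * b for b in vec_b))
    return dot / (norm_a * norm_b) if norm_a > 0 and norm_b > 0 else 0
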
Example #16
def main(args):
    config = FileUtils.load_config(args.config)

    type2entity_file = os.path.expanduser(config.get("type2entity_file", ""))
    entity_abstracts_file = os.path.expanduser(config.get("entity_abstracts_file", ""))
    if (not os.path.isfile(type2entity_file)) or (not os.path.isfile(entity_abstracts_file)):
        exit(1)

    indexer = IndexerDBpediaTypes(config)
    indexer.build_index(force=True)
    PLOGGER.info("Index build: <{}>".format(indexer.name))
Example #17
    def add_file(self, filename, reverse_triple=False, predicate_prefix=None):
        """Adds contents from an NTriples file to MongoDB.

        :param filename: NTriples file.
        :param reverse_triple: if set True, the subject and object values are swapped.
        :param predicate_prefix: prefix to be added to predicates.
        """
        print("Processing " + filename + "...")

        t = Triple()
        p = NTriplesParser(t)
        self.__m_id = None  # document id for MongoDB -- subj
        self.__m_contents = None  # document contents for MongoDB -- pred, obj
        i = 0

        with FileUtils.open_file_by_type(filename) as f:
            for line in f:
                try:
                    p.parsestring(line.decode("utf-8"))
                except ParseError:  # skip lines that couldn't be parsed
                    continue
                if t.subject() is None:  # only if parsed as a triple
                    continue

                # subject prefixing
                subj = self.__prefix.get_prefixed(t.subject())

                # predicate prefixing
                pred = self.__prefix.get_prefixed(t.predicate())
                if predicate_prefix is not None:
                    pred = predicate_prefix + pred

                # Object prefixing
                if type(t.object()) is URIRef:
                    obj = self.__prefix.get_prefixed(t.object())
                else:
                    obj = t.object()
                    if len(obj) == 0:
                        continue  # skip empty objects

                # write or append
                if reverse_triple:  # reverse subj and obj
                    self._next_triple(obj, pred, subj)
                else:  # normal mode
                    self._next_triple(subj, pred, obj)

                i += 1
                if i % 100000 == 0:
                    print(str(i // 1000) + "K lines processed from " + filename)

        # process last triple
        self._write_to_mongo()
Example #18
def make_relation_item_ids(rel_items_fpath):
    """Our relation ID is the original relation where the prefix any dash or blank is replaced with underscore.

    :param rel_items_fpath: items file path.
    """
    basename = os.path.basename(rel_items_fpath)  # professions or nationalities

    with open(os.sep.join([DATA_DIR, "{}_ids.tsv".format(basename)]), "w") as f_out:
        for item in FileUtils.read_file_as_list(rel_items_fpath):
            id = item.lower().replace(" ", "_").replace("-", "_")
            f_out.write("{}\t{}\n".format(item, id))
Example #19
File: el.py  Project: theVoogie/nordlys
def main(args):
    conf = FileUtils.load_config(args.config)
    el = EL(conf, Entity(), ElasticCache(DBPEDIA_INDEX), FeatureCache())

    if conf.get("gen_model", False):
        LTR.train(conf)
    elif args.query:
        res = el.link(args.query)
        pprint(res)
    else:
        el.batch_linking()
Example #20
    def build(self):
        """Builds word2vec collection from GoogleNews 300-dim pre-trained corpus."""
        self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        self.__mongo.drop()

        infile = FileUtils.open_file_by_type(self.__w2v_fname)
        i = 0
        for line in infile:
            term, vector = self.__parse_line(line)
            self.__mongo.add(term, {'vector': vector})
            i += 1
            if i % 1000 == 0:
                PLOGGER.info(str(i // 1000) + "K lines are loaded.")
Example #21
    def dump_differences(self, output_file):
        """Outputs query-level differences between two methods into a tab-separated file.

        The first method is considered the baseline, the differences are with respect to that.
        Output format: queryID res1 res2 diff(res2-res1)
        """
        te_method1 = TrecEval()
        te_method1.evaluate(self.__qrels, self.__run1_file)
        te_method2 = TrecEval()
        te_method2.evaluate(self.__qrels, self.__run2_file)
        data = []
        for query_id in te_method1.get_query_ids():
            res1 = te_method1.get_score(query_id, self.__metric)
            res2 = te_method2.get_score(query_id, self.__metric)
            data.append([query_id, res1, res2, round(res2 - res1, 4)])

        # sorts based on the differences desc
        sorted_data = sorted(data, key=lambda l: l[3], reverse=True)

        FileUtils.dump_tsv(output_file,
                           sorted_data,
                           header=["queryID", "method1", "method2", "diff"])
Example #22
    def generate_features(self, kb_file, output_file):
        """Core function for generating into output_file the features, with person-item data from kb_file.

        :param kb_file: path to the file with person items (a '.kb'-extension file).
        :param output_file:
        :return:
        """
        feat_w2v_approx = FeaturesW2VSimApprox()

        with open(output_file, "w") as f_out:
            # write tsv header
            header = ["person_id", "prof_id"]
            for k in self.K_VALUES:
                header.append("simCos_w2v_" + str(k))
            f_out.write("\t".join(header) + "\n")

            for line in FileUtils.read_file_as_list(kb_file):
                person_id, prof_id = line.split("\t")  # strip() done in read_file_as_list()
                values = [person_id, prof_id]

                person_tf, num_sent = self.get_person_tf(person_id)

                for k in self.K_VALUES:
                    # we take top-K profession terms

                    # compute simCosK
                    # where K is the top-K terms for the profession
                    term_weights_pr = {}  # top-K profession terms -> their tfidf weights
                    term_weights_pe = {}  # top-K person terms -> their tfidf weights

                    if prof_id in self.__stats:
                        for term, s in self.__stats[prof_id].items():
                            if s["rank"] <= k:
                                term_weights_pr[term] = float(s["tfidf"])
                                idf = s["tfidf"] / s[
                                    "tf"]  # we back-generate IDF from profession's TF-IDF
                                term_weights_pe[term] = person_tf.get(term,
                                                                      0) * idf

                        vec_pr = feat_w2v_approx.get_vector(term_weights_pr)
                        vec_pe = feat_w2v_approx.get_vector(term_weights_pe)
                        cos = cos_sim(vec_pr, vec_pe)
                    else:
                        cos = 0  # in some exceptional cases the profession does not have any sentences
                    values.append(str(cos))

                f_out.write("\t".join(values) + "\n")
Example #23
def main(args):
    config = FileUtils.load_config(args.config)
    if "_uri" not in config["index_name"]:
        print("index name might not be correct, please check again!")
        exit(0)

    indexer = IndexerDBpediaURI(config)

    fields_file = config.get("fields_file", "output/field_counts.json")
    if "fields_file" not in config:
        field_counts2json(fields_file)

    indexer.build()
    print("Index build: " + config["index_name"])
Example #24
def main(args):
    config = FileUtils.load_config(args.config)
    dbpedia_path = config.get("dbpedia_files_path", "")
    # Check DBpedia files
    PLOGGER.info("Checking needed DBpedia files under {}".format(dbpedia_path))
    for fname in [ENTITY_ABSTRACTS_FILE] + ENTITY_TYPES_FILES:
        if os.path.isfile(os.sep.join([dbpedia_path, fname])):
            PLOGGER.info("  - {}: OK".format(fname))
        else:
            PLOGGER.error("  - {}: Missing".format(fname))
            exit(1)

    indexer = IndexerDBpediaTypes(config)
    indexer.build_index(force=True)
Example #25
    def __load_entity_abstracts(self):
        num_lines = 0
        filename = os.sep.join([self.__dbpedia_path, ENTITY_ABSTRACTS_FILE])
        PLOGGER.info("Loading entity abstracts from {}".format(filename))
        for line in FileUtils.read_file_as_list(filename):
            entity, abstract = self.__parse_line(line)
            if abstract and len(abstract) > 0:  # skip empty objects
                self.__entity_abstracts[entity] = abstract

            num_lines += 1
            if num_lines % 10000 == 0:
                PLOGGER.info("  {}K lines processed".format(num_lines // 1000))

        PLOGGER.info("  Done.")
Example #26
def main(args):
    config = FileUtils.load_config(args.config)
    if "_uri" not in config["index_name"]:
        PLOGGER.error("index name might not be correct, please check again!")
        exit(0)

    if "fields_file" not in config:
        fields_count = compute_field_counts()
    else:
        fields_count = json.load(open(config["fields_file"]))

    indexer = IndexerDBpediaURI(config, fields_count)

    indexer.build()
    PLOGGER.info("Index build: " + config["index_name"])
Example #27
def main(args):
    config = FileUtils.load_config(args.config)
    elastic_term = ElasticCache(config["text_index"])
    lambdas = config.get("lambdas", [0.9, 0.1])

    queries = json.load(open(config["query_file"], "r"))
    mappings = json.load(open(config["mapping_file"], "r"))
    annots = load_annot(config["annot_file"])
    run = load_run(config["run_file"])

    instances = Instances()
    # gets the results
    out_file = open(config["output_file"], "w")
    qid_int = 0
    for qid, query in sorted(queries.items()):
        print("Scoring ", qid, "...")
        results, libsvm_str = {}, ""
        query_len = len(elastic_term.analyze_query(query).split())
        scorer = ScorerELR(ElasticCache(config["uri_index"]), annots[qid],
                           query_len, lambdas)
        for doc_id, p_T_d in sorted(run[qid].items()):
            query_mappings = get_mapping_query(annots[qid], mappings)
            p_E_d = scorer.score_doc(doc_id, query_mappings)
            properties = {
                'doc_id': doc_id,
                'query': query,
                'qid': qid,
                'qid_int': qid_int
            }
            features = {'p_T_d': p_T_d, 'p_E_d': p_E_d}
            ins = Instance(qid + "_" + doc_id,
                           features=features,
                           properties=properties)
            instances.add_instance(ins)
            # libsvm_str += ins.to_libsvm(qid_prop="qod_int")
            results[doc_id] = (lambdas[0] * p_T_d) + (lambdas[1] * p_E_d)
        qid_int += 1

        # Write trec format
        out_str = trec_format(results, qid, "elr")
        out_file.write(out_str)

    out_file.close()
    print("Output file:", config["output_file"])
    instances.to_json(config["json_file"])
    print("Output file:", config["json_file"])
Example #28
def main(args):
    config = FileUtils.load_config(args.config)

    type2entity_file = os.path.expanduser(config.get("type2entity_file", ""))
    if not os.path.isfile(type2entity_file):
        print("invalid path to type-to-entity source file: ", type2entity_file)
        exit(1)

    entity_abstracts_file = os.path.expanduser(
        config.get("entity_abstracts_file", ""))
    if not os.path.isfile(entity_abstracts_file):
        print("invalid path to entity abstracts source file: ",
              entity_abstracts_file)
        exit(1)

    indexer = IndexerDBpediaTypes(config, type2entity_file,
                                  entity_abstracts_file)
    indexer.build_index(force=True)
    print("Index build: <{}>".format(indexer.name))
Example #29
def main(args):
    example_config = {"index_name": "toy_index",
                      # "query_file": "data/queries/test_queries.json",
                      "first_pass": {
                          "num_docs": 1000,
                          "field": "content",
                          # "model": "LMJelinekMercer",
                          # "model_params": {"lambda": 0.1}
                      },
                      "second_pass": {
                          "field": "content",
                          "model": "lm",
                          "smoothing_method": "jm",
                          "smoothing_param": 0.1
                      },
                      "output_file": "output/test_retrieval.txt"
                      }
    config = FileUtils.load_config(args.config) if args.config != "" else example_config
    r = Retrieval(config)
    r.batch_retrieval()
Example #30
    def read_fb2dbp_file(self, is_39=False):
        """Reads the file and generates an initial mapping of Freebase to DBpedia IDs.
        Only proper DBpedia entities are considered; i.e. redirect and disambiguation pages are ignored.
        """
        fb2dbp_file = self.__fb2dbp_file_39 if is_39 else self.__fb2dbp_file
        print("Processing " + fb2dbp_file + "...")

        t = Triple()
        p = NTriplesParser(t)
        i = 0
        fb2dbp_mapping = defaultdict(set)
        with FileUtils.open_file_by_type(fb2dbp_file) as f:
            for line in f:
                try:
                    p.parsestring(line.decode("utf-8"))
                except ParseError:  # skip lines that couldn't be parsed
                    continue
                if t.subject() is None:  # only if parsed as a triple
                    continue

                # prefixing
                dbp_id = self.__prefix.get_prefixed(t.subject())
                fb_id = self.__prefix.get_prefixed(t.object())

                # if reading 3.9 file, converts ID to 2015-10 version
                if is_39:
                    dbp_id = EntityUtils.convert_39_to_201510(dbp_id)
                    fb2dbp_mapping[fb_id].add(dbp_id)

                # if reading 2015-10 file, keeps only the proper DBpedia entities
                else:
                    entity_utils = EntityUtils(
                        self.__mongo_dbpedia.find_by_id(dbp_id))
                    if entity_utils.is_entity():
                        fb2dbp_mapping[fb_id].add(dbp_id)
                i += 1
                if i % 1000 == 0:
                    print(str(i // 1000) + "K lines are processed!")

        return fb2dbp_mapping