Example 1
    def __init__(self):
        self.persons_ids = {}
        self.ids_persons = {}
        for line in FileUtils.read_file_as_list(PERSONS_IDS_F):
            person, id = line.split("\t", maxsplit=1)
            self.persons_ids[person] = id
            self.ids_persons[id] = person

        self.professions_ids = {}
        self.ids_professions = {}
        for line in FileUtils.read_file_as_list(PROFESSIONS_IDS_F):
            prof, id = line.split("\t", maxsplit=1)
            self.professions_ids[prof] = id
            self.ids_professions[id] = prof

        self.nationalities_ids = {}
        self.ids_nationalities = {}
        for line in FileUtils.read_file_as_list(NATIONALITIES_IDS_F):
            nation, id = line.split("\t", maxsplit=1)
            self.nationalities_ids[nation] = id
            self.ids_nationalities[id] = nation

        self.nationalities_countries = {}
        self.countries_nationalities = {}
        for line in FileUtils.read_file_as_list(COUNTRIES_NATIONALITIES_F):
            country, nation = line.split("\t", maxsplit=1)
            self.nationalities_countries[nation] = country
            self.countries_nationalities[country] = nation
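
Note: all of these examples depend on FileUtils.read_file_as_list, whose implementation is not shown here. Based on how it is used (and on a comment in a later example noting that strip() is already done inside it), a minimal sketch could look like the following; the body is an assumption, not the actual implementation.

class FileUtils(object):

    @staticmethod
    def read_file_as_list(fpath):
        """Reads a text file and returns its lines, stripped (assumed behavior)."""
        with open(fpath, encoding="utf-8") as f_in:
            return [line.strip() for line in f_in if line.strip()]
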
Example 2
    def __load_items_stats(self, items_tfidf_fpath):
        """Loads pre-computed tf-idf stats for items.

        :param items_tfidf_fpath:
        :return:
        """
        item_term_weights = {}

        for line in FileUtils.read_file_as_list(items_tfidf_fpath):
            item, term, _, _, weight = line.split("\t", maxsplit=4)
            if len(term) < 4:  # avoid short terms
                continue

            # some term cleaning
            if term.startswith("_"):
                term = term[1:]
            if term.endswith("_"):
                term = term[:-1]
            if term.startswith(WP_PREFIX):  # it's a person name
                term = term.split(WP_PREFIX)[-1]  # remove prefix
                term = "_".join([
                    word[0].upper() + word[1:] for word in term.split("_")
                ])  # capitalize every word

            item_d = item_term_weights.get(item, {})
            item_d[term] = float(weight)
            item_term_weights[item] = item_d

        return item_term_weights
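
The parsing above assumes a five-field tab-separated line per (item, term) pair, of which only the first, second, and fifth fields are used. A hedged illustration with made-up values (the two middle fields are ignored, whatever they hold):

line = "engineer\tbridge\t12\t0.8\t0.5321"  # made-up example line
item, term, _, _, weight = line.split("\t", maxsplit=4)
# -> item_term_weights == {"engineer": {"bridge": 0.5321}}
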
Example 3
    def __load_entity_abstracts(self, filename):
        prefix = URIPrefix()
        t = Triple()
        p = NTriplesParser(t)
        lines_counter = 0
        PLOGGER.info("Loading entity abstracts from {}".format(filename))
        for line in FileUtils.read_file_as_list(filename):
            # basic line parsing
            line = line.decode("utf-8") if isinstance(line, bytes) else line
            try:
                p.parsestring(line)
            except ParseError:  # skip lines that couldn't be parsed
                continue
            if t.subject() is None:  # only if parsed as a triple
                continue

            # Subject and object identification
            subj = prefix.get_prefixed(t.subject())
            obj = ""
            if type(t.object()) is URIRef:
                # PLOGGER.error("Error: it is URIRef the parsed obj")
                pass
            else:
                obj = t.object().encode("utf-8")
                if len(obj) == 0:
                    continue  # skip empty objects
            self.__entity_abstracts[subj] = obj

            lines_counter += 1
            if lines_counter % 10000 == 0:
                PLOGGER.info("\t{}K lines processed".format(lines_counter // 1000))

        PLOGGER.info("\n### Loading entity abstracts... Done.")
Example 4
def make_persons_fb_ids(persons_fpath):
    """Our person ID is the Freebase ID where the prefix 'm.' is replaced with 'fb_'.

    :param persons_fpath: 'persons'-named file path.
    """
    with open(PERSONS_IDS_F, "w") as f_out:
        for line in FileUtils.read_file_as_list(persons_fpath):
            person, raw_fb_id = line.split("\t", maxsplit=1)
            fb_id = "fb_" + raw_fb_id[2:]
            f_out.write("{}\t{}\n".format(person, fb_id))
Example 5
def make_professions_kb_translation(dest_translation_fpath, persons_ids_fpath, professions_ids_fpath,
                                    person_items_fpath):
    """A person ID -to- item ID translation schema is convenient.

    :param dest_translation_fpath: destination file path of IDs translation.
    :param persons_ids_fpath: person IDs file path.
    :param professions_ids_fpath: profession IDs file path.
    :param person_items_fpath: path to the file with person items (a '.kb'-extension file).
    :return:
    """
    persons_ids = {}
    for line in FileUtils.read_file_as_list(persons_ids_fpath):
        person, id = line.split("\t", maxsplit=1)
        persons_ids[person] = id

    professions_ids = {}
    for line in FileUtils.read_file_as_list(professions_ids_fpath):
        prof, id = line.split("\t", maxsplit=1)
        professions_ids[prof] = id

    person_items = {}
    for line in FileUtils.read_file_as_list(person_items_fpath):
        person, item = line.split("\t", maxsplit=1)
        person_items.setdefault(person, []).append(item)

    translations = {}
    for person, items in person_items.items():
        if person not in persons_ids:
            continue
        person_id = persons_ids[person]
        items_ids = []
        for item in items:
            if item not in professions_ids:
                continue
            items_ids.append(professions_ids[item])

        translations[person_id] = items_ids

    with open(dest_translation_fpath, "w") as f_out:
        for person_id, items_ids in translations.items():
            for item_id in items_ids:
                f_out.write("{}\t{}\n".format(person_id, item_id))
Example 6
def make_relation_item_ids(rel_items_fpath):
    """Our relation ID is the original relation where the prefix any dash or blank is replaced with underscore.

    :param rel_items_fpath: items file path.
    """
    basename = os.path.basename(rel_items_fpath)  # professions or nationalities

    with open(os.sep.join([DATA_DIR, "{}_ids.tsv".format(basename)]), "w") as f_out:
        for item in FileUtils.read_file_as_list(rel_items_fpath):
            id = item.lower().replace(" ", "_").replace("-", "_")
            f_out.write("{}\t{}\n".format(item, id))
Example 7
    def __load_entity_abstracts(self):
        num_lines = 0
        filename = os.sep.join([self.__dbpedia_path, ENTITY_ABSTRACTS_FILE])
        PLOGGER.info("Loading entity abstracts from {}".format(filename))
        for line in FileUtils.read_file_as_list(filename):
            entity, abstract = self.__parse_line(line)
            if abstract:  # skip empty abstracts
                self.__entity_abstracts[entity] = abstract

            num_lines += 1
            if num_lines % 10000 == 0:
                PLOGGER.info("  {}K lines processed".format(num_lines // 1000))

        PLOGGER.info("  Done.")
Example 8
    def generate_features(self, kb_file, output_file):
        """Core function for generating into output_file the features, with person-item data from kb_file.

        :param kb_file: path to the file with person items (a '.kb'-extension file).
        :param output_file:
        :return:
        """
        feat_w2v_approx = FeaturesW2VSimApprox()

        with open(output_file, "w") as f_out:
            # write tsv header
            header = ["person_id", "prof_id"]
            for k in self.K_VALUES:
                header.append("simCos_w2v_" + str(k))
            f_out.write("\t".join(header) + "\n")

            for line in FileUtils.read_file_as_list(kb_file):
                person_id, prof_id = line.split("\t")  # strip() done in read_file_as_list()
                values = [person_id, prof_id]

                person_tf, num_sent = self.get_person_tf(person_id)

                for k in self.K_VALUES:
                    # compute simCosK, where K is the number of top profession terms taken
                    term_weights_pr = {}  # top-K profession terms -> their tfidf weights
                    term_weights_pe = {}  # top-K person terms -> their tfidf weights

                    if prof_id in self.__stats:
                        for term, s in self.__stats[prof_id].items():
                            if s["rank"] <= k:
                                term_weights_pr[term] = float(s["tfidf"])
                                # we back-generate IDF from the profession's TF-IDF
                                idf = s["tfidf"] / s["tf"]
                                term_weights_pe[term] = person_tf.get(term, 0) * idf

                        vec_pr = feat_w2v_approx.get_vector(term_weights_pr)
                        vec_pe = feat_w2v_approx.get_vector(term_weights_pe)
                        cos = cos_sim(vec_pr, vec_pe)
                    else:
                        cos = 0  # in some exceptional cases the profession does not have any sentences
                    values.append(str(cos))

                f_out.write("\t".join(values) + "\n")
Example 9
    def __load_entity_types(self):
        num_lines = 0
        for types_file in ENTITY_TYPES_FILES:
            filename = os.sep.join([self.__dbpedia_path, types_file])
            PLOGGER.info("Loading entity types from {}".format(filename))
            for line in FileUtils.read_file_as_list(filename):
                entity, entity_type = self.__parse_line(line)
                if not isinstance(entity_type, str):  # Likely result of parsing error
                    continue
                if not entity_type.startswith("<dbo:"):
                    PLOGGER.info("  Non-DBpedia type: {}".format(entity_type))
                    continue
                if not entity.startswith("<dbpedia:"):
                    PLOGGER.info("  Invalid entity: {}".format(entity))
                    continue
                self.__types_entities[entity_type].append(entity)

                num_lines += 1
                if num_lines % 10000 == 0:
                    PLOGGER.info("  {}K lines processed".format(num_lines //
                                                                1000))
            PLOGGER.info("  Done.")
Example 10
    def create_pdf(self,
                   diff_file,
                   pdf_file,
                   title="",
                   xlabel="",
                   ylabel="",
                   aspect_ratio="equal",
                   separator="\t"):
        """Create bar plot for differences in pdf.
        
        This function is used to load difference .csv file, 
        create bar plot and store as a pfd file.

        :pdf: created and saved pdf file       
        """
        data = FileUtils.read_file_as_list(diff_file)

        scores = []
        for item in data:
            if "diff" in item:  # ignore the first line(title)
                continue
            scores.append(float(item.split(separator)[3]))

        scores = sorted(scores, reverse=True)

        with PdfPages(pdf_file) as pdf:
            n = len(scores)
            x = range(n)
            plt.figure(figsize=(4, 4))
            plt.title(title)
            plt.xlabel(xlabel)
            plt.ylabel(ylabel)
            plt.bar(x, scores, self.width, color=self.color)
            plt.tight_layout()  # may emit a warning, but still works
            pdf.savefig()
            plt.close()
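
A minimal usage sketch, assuming a plotter object exposing this method together with width and color attributes, and a differences file whose fourth tab-separated column holds the scores (the class name and paths below are made up):

plotter = DiffBarPlotter()  # hypothetical class holding self.width and self.color
plotter.create_pdf("diffs.tsv", "diffs.pdf", title="Score differences",
                   xlabel="rank", ylabel="difference")
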
Example 11
    def __load_sample_entities(self):
        """Loads the set of entities to be sampled from file."""
        self.sample_entities = FileUtils.read_file_as_list(self.entities_file)
Example 12
    def get_all_features_approx(self, all_items_fpath, items_tfidf_fpath,
                                first_snippets_fpath, person_items_fpath,
                                dest_fpath):
        """Core function for generating into output_file the features, with person-item data from kb_file.

        :param all_items_fpath: path to the file with all items (e.g. 'professions' named file).
        :param items_tfidf_fpath: path to the file with item tf-idf stats.
        :param first_snippets_fpath: path to the file with first snippets.
        :param person_items_fpath: path to the file with person items (a '.kb'-extension file).
        :param dest_fpath: destination file path.
        :return:
        """
        wcup_ids = WSDMCupIDs()

        # -------
        # Get persons vectors

        # Load person-to-snippet mapping
        persons_to_vec = {}
        for line in FileUtils.read_file_as_list(first_snippets_fpath):
            person, snippet = line.split("\t")
            person = person.split("<{}".format(DBPEDIA_URI_PREFIX))[-1]
            person = person.split(">")[0].replace("_", " ")

            c = {
                k: v
                for k, v in Counter(snippet.split()).items()
                if k not in STOPWORDS and v >= 2
            }  # min freq = 2
            total = sum(c.values())
            c = {k: round(v / total, 4) for k, v in c.items()}
            v = self.get_vector(c)
            persons_to_vec[person] = v
        print("Get persons vectors... OK.")

        # -------
        # Get profession vectors

        # Load professions tf-idf
        # dict from each item (e.g. professions) to its term-to-weight dict
        item_term_weights = self.__load_items_stats(items_tfidf_fpath)
        prof_to_vec = {}
        for raw_item in FileUtils.read_file_as_list(all_items_fpath):
            # for professions
            item = wcup_ids.get_id_from_prof(raw_item)  # DON'T FORGET!
            v = self.get_vector(item_term_weights.get(item, {}))

            # ---
            # Alternative vector version, using the w2v vector when it exists, without top-K terms
            #
            # if not self.word2vec.contains_word(item):  # build our own vector with top-K terms for that item
            #     v = self.get_vector(item_term_weights.get(item, {}))
            # else:  # take advantage of an already well-represented vector present in the w2v collection
            #     v = self.word2vec.get_vector(item)
            # ---
            prof_to_vec[item] = v
        print("Get profession vectors... OK.")

        # -------
        # Load persons-to-professions and get cos sims
        person_to_prof_to_cos = {}  # e.g. person-to-profession-to-cos_sim
        for line in FileUtils.read_file_as_list(person_items_fpath):
            person, item = line.split("\t", maxsplit=1)
            item = wcup_ids.get_id_from_prof(item)  # DON'T FORGET!

            if person not in person_to_prof_to_cos:
                person_to_prof_to_cos[person] = {}

            person_vec = persons_to_vec.get(
                person,
                self.word2vec.get_zeros_vector(self.word2vec.dimension))
            prof_vec = prof_to_vec.get(
                item, self.word2vec.get_zeros_vector(self.word2vec.dimension))

            feat_value = cos_sim(person_vec, prof_vec)
            person_to_prof_to_cos[person][item] = feat_value

        with open(dest_fpath, "w") as f_out:
            f_out.write("person\tprofession\tsimCos_w2v_aggr_100\n")
            # sorted by person full name; within a person, sorted decreasingly by sim
            for person, data in sorted(person_to_prof_to_cos.items()):
                for item, sim in sorted(data.items(), key=itemgetter(1), reverse=True):
                    f_out.write("{}\t{}\t{}\n".format(
                        wcup_ids.get_id_from_person(person), item, sim))
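
To illustrate the snippet weighting step above: terms are counted, stopwords and terms occurring fewer than twice are dropped, and the surviving counts are normalized into relative frequencies. A small sketch with a made-up snippet and a placeholder stopword set:

from collections import Counter

STOPWORDS = {"a", "and", "was"}  # placeholder stopword set for this illustration
snippet = "a painter and a sculptor was a painter"
c = {k: v for k, v in Counter(snippet.split()).items()
     if k not in STOPWORDS and v >= 2}               # {'painter': 2} ('sculptor' occurs only once)
total = sum(c.values())                              # 2
c = {k: round(v / total, 4) for k, v in c.items()}   # {'painter': 1.0}
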
Example 13
def main(all_items_fpath, first_snippets_fpath, person_items_fpath, relation):
    """Extracts features about 1st Wikipedia sentences and paragraphs.

    :param all_items_fpath: path to the file with all items (e.g. 'professions' named file).
    :param first_snippets_fpath: path to the file with first snippets.
    :param person_items_fpath: path to the file with person items (a '.kb'-extension file).
    :param relation: a string in {REL_PROFESSION, REL_NATIONALITY}.
    :return:
    """

    # for creating dump filename
    snippet_basename = os.path.basename(first_snippets_fpath).split(".txt")[0]
    print("Processing for {}".format(snippet_basename))

    # Load all items, e.g. all professions
    all_rel_items = set(FileUtils.read_file_as_list(all_items_fpath))

    # Load person-to-snippet mapping
    persons_snippets = {}
    for line in FileUtils.read_file_as_list(first_snippets_fpath):
        person, snippet = line.split("\t")
        person = person.split("<{}".format(DBPEDIA_URI_PREFIX))[-1]
        person = person.split(">")[0].replace("_", " ")
        persons_snippets[person] = snippet
    print("\t Loaded snippets... OK.")

    # Make dict: person to set of (all possible) items in snippet
    person_to_all_snippet_items = {}
    # person_counter = 0
    for person, snippet in persons_snippets.items():
        for item in all_rel_items:
            # important to lowercase when searching for item
            match = re.search(r"[,\s\.]({})[,\s\.]".format(item.lower()),
                              snippet)
            if match:
                person_to_all_snippet_items.setdefault(person, set()).add(item)
    print("\t Made 1st dict... OK.")

    # Make dict: person to first item in snippet (among all possible items)
    person_to_first_snippet_item = {}
    # person_counter = 0
    for person, snippet in persons_snippets.items():
        fst_item_index = len(snippet) - 1
        fst_item = ""
        for item in person_to_all_snippet_items.get(person, set()):
            # in this case, we use re.search() since we need the string index where the found occurrence starts
            # important to lowercase when searching for item
            match = re.search(r"[,\s\.]({})[,\s\.]".format(item.lower()),
                              snippet)
            if match:
                # item (e.g. profession) occurs not only as substring, but all its words (sometimes it's multiword)
                # appear as words in snippet
                # otherwise, it will count, e.g., "orator" as a profession when it's only a substring of "oratorio"
                item_index = match.start(1)
                if item_index < fst_item_index:
                    fst_item_index = item_index
                    fst_item = item

        person_to_first_snippet_item[person] = fst_item
    print("\t Made 2nd dict... OK.")

    # Make final feature mappings
    is_item_in = {}  # e.g. person-to-profession-to-yes_or_no mapping
    is_1st_item_in = {}  # e.g. person-to-profession-to-yes_or_no mapping
    for line in FileUtils.read_file_as_list(person_items_fpath):
        person, item = line.split("\t", maxsplit=1)

        if person not in is_item_in:
            is_item_in[person] = {}
        feat_value = "1" if item in person_to_all_snippet_items.get(person, set()) else "0"
        is_item_in[person][item] = feat_value

        if person not in is_1st_item_in:
            is_1st_item_in[person] = {}
        # the first-item mapping holds a single item string, so compare for equality
        feat_value = "1" if item == person_to_first_snippet_item.get(person, "") else "0"
        is_1st_item_in[person][item] = feat_value

    print("\t Main for-loop running... OK.")

    # -------
    # Dumping tsv
    persons_snippets.clear()
    wcup_ids = WSDMCupIDs()

    is_item_in_fpath = os.sep.join(
        [OUTPUT_DIR, "is_{}_in_{}.tsv".format(relation, snippet_basename)])
    with open(is_item_in_fpath, "w") as f_is_item_in:
        f_is_item_in.write("person\t{}\tis_{}_in_{}\n".format(
            relation, relation, snippet_basename))
        for person, item_val in sorted(is_item_in.items()):
            for item, val in sorted(item_val.items()):
                f_is_item_in.write("{}\t{}\t{}\n".format(
                    wcup_ids.get_id_from_person(person),
                    wcup_ids.get_id_from_prof(item), val))

    is_1st_item_in_fpath = os.sep.join(
        [OUTPUT_DIR, "is_1st_{}_in_{}.tsv".format(relation, snippet_basename)])
    with open(is_1st_item_in_fpath, "w") as f_is_item_in:
        f_is_item_in.write("person\t{}\tis_the_1st_{}_in_{}\n".format(
            relation, relation, snippet_basename))
        for person, item_val in sorted(is_1st_item_in.items()):
            for item, val in sorted(item_val.items()):
                f_is_item_in.write("{}\t{}\t{}\n".format(
                    wcup_ids.get_id_from_person(person),
                    wcup_ids.get_id_from_prof(item), val))

    print("\t Dumping both tsv files... OK.")