class FACCToMongo(object):
    """Loads FACC surface forms from tsv files into a Mongo collection."""

    def __init__(self, config):
        """Inserts FACC surface forms to Mongo.

        :param config: dict with the collection name, the input path and the
            predicate; the lowercase flag is optional and defaults to True.
        """
        self.__check_config(config)
        self.__collection = config[KEY_COLLECTION]
        self.__path = config[KEY_PATH]
        self.__predicate = config[KEY_PREDICATE]
        self.__lowercase = config[KEY_LOWERCASE]
        self.__mongo = None  # connection is created lazily in build()

    @staticmethod
    def __check_config(config):
        """Checks config parameters and sets default values.

        Logs an error and terminates the process with exit status 1 as soon
        as a mandatory key is found to be missing.

        :param config: configuration dict; updated in place with defaults.
        """
        for key in (KEY_COLLECTION, KEY_PATH, KEY_PREDICATE):
            if key not in config:
                # The message needs a %s placeholder: logging treats extra
                # positional arguments as format arguments.
                # NOTE(review): assumes PLOGGER follows the logging.Logger API.
                PLOGGER.error("Error in config file: %s is missing", key)
                sys.exit(1)
        if KEY_LOWERCASE not in config:
            config[KEY_LOWERCASE] = True

    def __add_surface_form(self, surface_form, freebase_uri, count):
        """Adds a surface form.

        :param surface_form: surface form text (lowercased when configured).
        :param freebase_uri: Freebase URI, e.g. /m/047b9p0.
        :param count: amount by which the stored count is increased.
        """
        if self.__lowercase:
            surface_form = surface_form.lower()
        # Increases count; if the id is not associated with the surface form
        # yet, it adds it with count.
        freebase_id = self.__convert_to_fb_id(freebase_uri)
        self.__mongo.inc_in_dict(surface_form, self.__predicate, freebase_id,
                                 count)

    def __convert_to_fb_id(self, fb_uri):
        """Converts /m/047b9p0 to <fb:m.047b9p0>"""
        fb_id = fb_uri.replace("/", ".")
        # drop the leading character (the dot that replaced the first slash)
        return "<fb:" + fb_id[1:] + ">"

    def __add_file(self, tsv_filename):
        """Adds name variants from an FACC tsv file.

        Each non-blank line is expected to hold three tab-separated fields:
        surface form, Freebase URI and an integer count.

        :param tsv_filename: path of the tsv file to load.
        """
        PLOGGER.info("Adding name variants from '" + tsv_filename + "'...")
        # The context manager closes the file even if a line fails to parse.
        with open(tsv_filename, "r") as infile:
            for line in infile:
                if not line.strip():
                    continue  # blank lines carry no record
                fields = line.rstrip().split("\t")
                self.__add_surface_form(fields[0], fields[1], int(fields[2]))

    def build(self):
        """Builds surface form collection from FACC annotations.

        Drops the target collection, then loads every ``.tsv`` file found
        under the configured path (walked recursively).
        """
        self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        self.__mongo.drop()

        for path, _dirs, files in os.walk(self.__path):
            for fn in files:
                if fn.endswith(".tsv"):
                    self.__add_file(os.path.join(path, fn))

        PLOGGER.info("Collection " + self.__collection + " is built.")
def build_collection(self, mappings):
    """Rebuilds the Mongo collection from the given mappings.

    The collection is dropped first; then, for every key of ``mappings``,
    each of its values is appended under the ``!<owl:sameAs>`` predicate.
    Progress is printed after every 1000 keys.

    :param mappings: dict whose values are iterables of ids
        (presumably Freebase id -> DBpedia ids, going by the variable
        names -- verify against the caller).
    """
    predicate = "!<owl:sameAs>"
    mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
    mongo.drop()

    for done, (fb_id, dbp_ids) in enumerate(mappings.items(), start=1):
        for dbp_id in dbp_ids:
            mongo.append_set(fb_id, predicate, [dbp_id])
        if done % 1000 == 0:
            print(str(done // 1000) + "K entities are added!")
class Word2VecToMongo(object):
    """Loads plain-text word2vec vectors into a Mongo collection."""

    def __init__(self, config):
        """Validates the config and stores the collection and file names.

        :param config: dict with the collection name and the mapping file.
        """
        self.__check_config(config)
        self.__collection = config[KEY_COLLECTION]
        self.__w2v_fname = config[KEY_MAPPING_FILE]
        self.__mongo = None  # connection is created lazily in build()

    @staticmethod
    def __check_config(config):
        """Checks params and set default values.

        Prints an error and terminates with exit status 1 when a mandatory
        key is missing or the mapping file does not exist.

        :param config: configuration dict.
        :return: the same config object.
        """
        try:
            for key in (KEY_COLLECTION, KEY_MAPPING_FILE):
                if key not in config:
                    raise Exception(key + " is missing")
            if not op.exists(config[KEY_MAPPING_FILE]):
                raise Exception("Mapping file path does not exist.")
        except Exception as e:
            print("Error in config file: ", e)
            # The bare exit() builtin is injected by the site module and is
            # not guaranteed to exist; SystemExit is what it raises anyway.
            raise SystemExit(1)
        return config

    def __parse_line(self, line):
        """
        Parses a line of the plain-text GoogleNews 300-dim pre-trained corpus.

        :param line: a word followed by whitespace-separated vector values.
        :type line: string
        :return: a (word, vector) tuple, where vector is a list of floats.
        """
        word, vec_str = line.rstrip().split(maxsplit=1)
        vector = [float(x) for x in vec_str.split()]
        return word, vector

    def build(self):
        """Builds word2vec collection from GoogleNews 300-dim pre-trained corpus.

        Drops the target collection, then stores one document per input
        line, keyed by the term with the vector under the 'vector' field.
        """
        self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        self.__mongo.drop()

        i = 0
        # The context manager guarantees the input file gets closed.
        with FileUtils.open_file_by_type(self.__w2v_fname) as infile:
            for line in infile:
                if not line.strip():
                    continue  # blank lines carry no vector
                term, vector = self.__parse_line(line)
                self.__mongo.add(term, {'vector': vector})
                i += 1
                if i % 1000 == 0:
                    # integer division: report "1K", not "1.0K"
                    print(str(i // 1000) + "K lines are loaded.")
class NTriplesToMongoDB(object):
    """Loads NTriples files into a Mongo collection, one document per subject."""

    def __init__(self, host, db, collection):
        """Connects to the target collection and resets the triple buffer.

        :param host: Mongo host.
        :param db: Mongo database name.
        :param collection: Mongo collection name.
        """
        self.__mongo = Mongo(host, db, collection)
        self.__prefix = URIPrefix()
        self.__m_id = None  # id of the buffered document (the subject)
        self.__m_contents = None  # buffered {predicate: [objects]} dict
        logging.basicConfig(level="ERROR")  # no warnings from the rdf parser

    def _next_triple(self, subj, pred, obj):
        """Processes a triple.

        - Appends to previous triple if it's the same subject.
        - Otherwise inserts last triple and creates a new one.

        :param subj: subject; used as the document id.
        :param pred: predicate; used as the field name.
        :param obj: object; appended to the field's value list.
        """
        if self.__m_id is not None and self.__m_id == subj:
            self.__m_contents.setdefault(pred, []).append(obj)
        else:
            self._write_to_mongo()
            self.__m_id = subj
            self.__m_contents = {pred: [obj]}

    def _write_to_mongo(self):
        """Writes triple (inserts or appends existing) to MongoDB collection.

        Does nothing when no subject is buffered; clears the buffer
        afterwards.
        """
        if self.__m_id is not None:
            for field, value in self.__m_contents.items():
                self.__mongo.append_set(self.__m_id, field, value)
            self.__m_id = None
            self.__m_contents = None

    def drop(self):
        """Deletes the collection."""
        self.__mongo.drop()

    def add_file(self, filename, reverse_triple=False, predicate_prefix=None):
        """Adds contents from an NTriples file to MongoDB.

        Consecutive triples that share a subject are buffered and written
        together. Lines that fail to parse and triples with an empty object
        are skipped.

        :param filename: NTriples file.
        :param reverse_triple: if set True, the subject and object values are swapped.
        :param predicate_prefix: prefix to be added to predicates.
        """
        print("Processing " + filename + "...")

        t = Triple()
        p = NTriplesParser(t)
        self.__m_id = None  # document id for MongoDB -- subj
        self.__m_contents = None  # document contents for MongoDB -- pred, obj
        i = 0

        with FileUtils.open_file_by_type(filename) as f:
            for line in f:
                try:
                    p.parsestring(line.decode("utf-8"))
                except ParseError:  # skip lines that couldn't be parsed
                    continue
                if t.subject() is None:  # only if parsed as a triple
                    continue

                # subject prefixing
                subj = self.__prefix.get_prefixed(t.subject())

                # predicate prefixing
                pred = self.__prefix.get_prefixed(t.predicate())
                if predicate_prefix is not None:
                    pred = predicate_prefix + pred

                # object prefixing (only URIs are prefixed)
                obj = t.object()
                if isinstance(obj, URIRef):
                    obj = self.__prefix.get_prefixed(obj)
                # An explicit length test is kept on purpose: the object may
                # be a non-str type whose truthiness differs from emptiness.
                if len(obj) == 0:
                    continue  # skip empty objects

                # write or append
                if reverse_triple:  # reverse subj and obj
                    self._next_triple(obj, pred, subj)
                else:  # normal mode
                    self._next_triple(subj, pred, obj)

                i += 1
                if i % 100000 == 0:
                    print(str(i // 1000) + "K lines processed from " + filename)

        # process last triple
        self._write_to_mongo()
class DBpediaSurfaceforms2Mongo(object):
    """Derives a surface form collection from the DBpedia Mongo collection."""

    def __init__(self, config):
        """Inserts DBpedia surface forms to Mongo.

        :param config: dict with the target collection name; the lowercase
            flag is optional and defaults to True.
        """
        self.__check_config(config)
        self.__lowercase = config[KEY_LOWERCASE]
        self.__collection = config[KEY_COLLECTION]
        self.__mongo = None  # target collection, opened in build_collection()
        self.__mongo_dbpedia = Mongo(MONGO_HOST, MONGO_DB,
                                     MONGO_COLLECTION_DBPEDIA)

    @staticmethod
    def __check_config(config):
        """Validates the config and fills in default values.

        Prints an error and exits with status 1 if the collection name is
        missing; sets the lowercase flag to True when it is absent.
        """
        try:
            if KEY_COLLECTION not in config:
                raise Exception(KEY_COLLECTION + " is missing")
            config.setdefault(KEY_LOWERCASE, True)
        except Exception as err:
            print("Error in config file: ", err)
            sys.exit(1)

    def __add_surface_form(self, surface_form, predicate, entity_id):
        """Registers one occurrence of a surface form for an entity.

        The "(disambiguation)" marker is removed from the surface form and
        the result is lowercased when so configured. Surface forms whose
        object size reaches 1024 are silently ignored.

        :param surface_form: surface form for entity
        :param predicate: predicate that entity is extracted from e.g. <rdfs:label>
        :param entity_id: entity ID
        """
        too_large = sys.getsizeof(surface_form) >= 1024  # Mongo key limit
        if too_large:
            return

        cleaned = surface_form.replace("(disambiguation)", "").strip()
        key = cleaned.lower() if self.__lowercase else cleaned
        self.__mongo.inc_in_dict(key, predicate, entity_id, 1)

    def build_collection(self):
        """Adds all name variants from DBpedia.

        Drops the target collection and walks over every document of the
        DBpedia collection. Entities without a name are skipped; progress
        is printed after every 1000 named entities.
        """
        self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        self.__mongo.drop()

        foaf_name_predicate = "<foaf:name>"
        redirect = EntityUtils.PREDICATE_REDIRECT
        disambiguate = EntityUtils.PREDICATE_DISAMBIGUATE

        processed = 0
        for mdoc in self.__mongo_dbpedia.find_all():
            entity = EntityUtils(Mongo.unescape_doc(mdoc))

            # nothing to index for nameless entities
            if not entity.has_name():
                continue
            name = entity.get_name()

            # redirect page: the name points at the redirect target
            if entity.is_redirect():
                target_id = entity.get_predicate(redirect)[0]
                self.__add_surface_form(name, redirect, target_id)

            # disambiguation page: the name points at every listed entity
            if entity.has_predicate(disambiguate):
                for candidate_id in entity.get_predicate(disambiguate):
                    self.__add_surface_form(name, disambiguate, candidate_id)

            # proper entity (neither redirect nor disambiguation page)
            if entity.is_entity():
                own_id = entity.get_id()
                # the entity's main name
                self.__add_surface_form(name, EntityUtils.PREDICATE_NAME,
                                        own_id)
                # any additional foaf names
                if entity.has_predicate(foaf_name_predicate):
                    for alt_name in entity.get_predicate(foaf_name_predicate):
                        self.__add_surface_form(alt_name, foaf_name_predicate,
                                                own_id)

            processed += 1
            if processed % 1000 == 0:
                print(str(processed // 1000) + "K entities processed")