def __call__(self, node_dict, node_tree):
    """Detects parts of noun-phrases via CPL"""
    noun_parts = {}  # node => list of parts
    have_parent = set()
    for part in [n for n in node_dict.values() if n.tag in PROP_TAGS]:
        dep_parent = node_tree.parent(part)
        if dep_parent is not None:
            dep, parent = dep_parent
            if all([
                    dep == Dep.CPL,
                    not node_tree.has_child_via_set(part, MARKER_DEPS),
                    part.idx < parent.idx,
                    parent.tag in NOUN_TAGS
            ]):
                append_to_dict_list(noun_parts, parent, part)
                have_parent.add(part)

    # Register chunks
    flattened = set()
    for root in [
            n for n in node_dict.values()
            if n in noun_parts and n not in have_parent
    ]:
        flattened |= _chunk(root, noun_parts, self.__form_eids)

    # Clean nodes
    return {
        idx: node
        for idx, node in node_dict.items() if node not in flattened
    }
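# The chunker above, and several functions below, rely on an `append_to_dict_list`
# helper that is not shown in this section. A minimal sketch of what it presumably
# does (assumed behaviour; the project's real helper may instead wrap
# collections.defaultdict(list)):
def append_to_dict_list(dict_of_lists, key, value):
    """Append `value` to the list stored under `key`, creating the list on first use."""
    dict_of_lists.setdefault(key, []).append(value)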
def __init__(self, ngrams, last_order, nb_processes):
    ngrams_dict = {}  # order => list of ngrams
    self.__total_counts = {order: 0 for order in range(1, last_order + 1)}  # order => total_count
    for ngram in ngrams:
        self.__total_counts[ngram.order] += ngram.count
        append_to_dict_list(ngrams_dict, ngram.order, ngram)
    super().__init__(ngrams_dict, last_order, nb_processes)
def __init__(self, *min_counts):
    """Pass the min count for each order needed.

    Examples:
    * If only unigrams are needed with min count 1, instantiate LanguageModel(1)
    * To get unigrams with min count 10 and bigrams with min count 5, instantiate LanguageModel(10, 5)
    """
    self.__min_counts = min_counts
    self.__max_order = len(self.__min_counts)
    self.__ngrams_details = {}  # ngram => tuple (count, proba, logp)
    self.__unk = (0, 0.0, float('-inf'))  # Unknown ngram (count=0, proba=0.0, logp=-inf)
    self.__tokenize = Tokenizer()

    # Load ngrams
    for idx, min_count in enumerate(self.__min_counts):
        order = idx + 1
        for text, count, proba in load_pkl_file(
                cfg.DATA_DIR / 'langmodel' / ('%d_grams.pkl' % order)):
            if count >= min_count:
                self.__ngrams_details[text] = (count, proba, math.log(proba))

    # Build next_token dict
    self.__next_tokens = {}  # ngram => list of tuples (token, proba) ordered by descending proba
    if self.__max_order > 1:
        for ngram, details in self.__ngrams_details.items():
            tokens = ngram.split()
            if len(tokens) > 1:
                append_to_dict_list(self.__next_tokens, ' '.join(tokens[:-1]),
                                    (tokens[-1], details[1]))
        for ngram, next_tokens in self.__next_tokens.items():
            next_tokens.sort(key=lambda x: x[1], reverse=True)
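# Restating the docstring above as runnable calls: the constructor takes one
# minimum count per n-gram order (the class name LanguageModel comes from the
# docstring; the variable names below are illustrative only).
lm_unigrams = LanguageModel(1)     # unigrams only, min count 1
lm_bigrams = LanguageModel(10, 5)  # unigrams with min count 10, bigrams with min count 5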
def _prepare_mkn(ngrams, last_order):
    """Calculate all counts necessary prior to smoothing"""
    lm_ngrams = {}  # order => list of ngrams
    wcs = collections.Counter()
    wc1 = collections.Counter()
    wc2 = collections.Counter()
    wc3 = collections.Counter()
    n1 = collections.Counter()  # Number of ngrams with count 1
    n2 = collections.Counter()  # Number of ngrams with count 2
    n3 = collections.Counter()  # Number of ngrams with count 3
    n4 = collections.Counter()  # Number of ngrams with count 4
    for ngram in ngrams:
        # n counts
        if ngram.count == 1:
            n1[ngram.order] += 1
        elif ngram.count == 2:
            n2[ngram.order] += 1
        elif ngram.count == 3:
            n3[ngram.order] += 1
        elif ngram.count == 4:
            n4[ngram.order] += 1

        # Wildcards
        if ngram.order > 1:
            wcs['%s %s' % ('•', ' '.join(ngram.tokens[1:]))] += 1  # • ngram
            wc = '%s %s' % (' '.join(ngram.tokens[:-1]), '•')  # ngram •
            if ngram.count == 1:
                wc1[wc] += 1
            elif ngram.count == 2:
                wc2[wc] += 1
            else:
                wc3[wc] += 1
            if ngram.order > 2:
                wcs['%s %s %s' % ('•', ' '.join(ngram.tokens[1:-1]), '•')] += 1  # • ngram •
        append_to_dict_list(lm_ngrams, ngram.order, ngram)

    # Calculate discounting values
    d1 = {}
    d2 = {}
    d3 = {}
    for order in range(2, last_order + 1):
        if any([n1[order] == 0, n2[order] == 0, n3[order] == 0, n4[order] == 0]):
            raise ModifiedKneserNeyNotEnoughDataError()
        else:
            y = n1[order] / (n1[order] + 2. * n2[order])
            d1[order] = 1. - (2. * y * n2[order] / n1[order])
            d2[order] = 2. - (3. * y * n3[order] / n2[order])
            d3[order] = 3. - (4. * y * n4[order] / n3[order])

    return lm_ngrams, wcs, wc1, wc2, wc3, d1, d2, d3
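# The discounts follow the standard modified Kneser-Ney formulas:
#   Y  = n1 / (n1 + 2*n2)
#   D1 = 1 - 2*Y*n2/n1,  D2 = 2 - 3*Y*n3/n2,  D3+ = 3 - 4*Y*n4/n3
# A small numeric check for a single order, using made-up count-of-count values
# (the counts below are illustrative, not taken from real data):
n1, n2, n3, n4 = 1000, 400, 200, 120
y = n1 / (n1 + 2. * n2)        # ≈ 0.5556
d1 = 1. - (2. * y * n2 / n1)   # ≈ 0.5556
d2 = 2. - (3. * y * n3 / n2)   # ≈ 1.1667
d3 = 3. - (4. * y * n4 / n3)   # ≈ 1.6667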
def extract_classes(resources_dir):
    """Extracts entity classes from the knowledge base"""
    reports_dir = cfg.REPORTS_DIR / 'extract_classes'
    timer = Timer()
    print()

    # EXTRACT URIs & THEIR ENTITY ID
    # -------------------------------------------------------------------------->
    print('Extracting entity IDs...')
    re_wikidata_uri = re.compile(r'<http://www.wikidata.org/entity/Q(\d+)>')
    with txt_file_reader(pathlib.Path(resources_dir) / 'yago-wd-sameAs.nt') as data:
        yago_eid = {}  # yago_uri => entity id (numerical string)
        for line in data:
            yago_uri, _, same_as, _ = line.strip().split('\t')
            match = re.fullmatch(re_wikidata_uri, same_as)
            if match:
                yago_eid[yago_uri] = match.group(1)
    with txt_file_writer(reports_dir / 'uri_eid.txt') as report:
        for yago_uri, eid in yago_eid.items():
            report.write('%s\t%s\n' % (yago_uri, eid))
    print('{:,} URIs extracted in {}'.format(len(yago_eid), timer.lap_time))
    print()

    # GENERATE CLASS IDs
    # -------------------------------------------------------------------------->
    print('Extracting class IDs...')
    with txt_file_reader(pathlib.Path(resources_dir) / 'yago-wd-schema.nt') as data:
        uri_cid = {}  # schema URI => class ID
        class_id = 1
        for line in data:
            class_uri, _, _, _ = line.strip().split('\t')
            uri = class_uri[1:-1]
            if uri not in uri_cid and uri.startswith('http'):
                uri_cid[uri] = class_id
                class_id += 1
    with txt_file_writer(reports_dir / 'uri_cid.txt') as report:
        for uri, cid in uri_cid.items():
            report.write('%s\t%d\n' % (uri, cid))
    print('Extracted {:,} classes in {}'.format(len(uri_cid), timer.lap_time))
    print()

    # EXTRACT INSTANCE_OF RELATIONS
    # -------------------------------------------------------------------------->
    print('Extracting types...')
    with txt_file_reader(pathlib.Path(resources_dir) / 'yago-wd-simple-types.nt') as data:
        eid_cids = {}  # entity id => list of class ids
        for line in data:
            yago_uri, _, instance_of, _ = line.strip().split('\t')
            eid = yago_eid.get(yago_uri, None)
            if eid is not None:
                append_to_dict_list(eid_cids, int(eid), uri_cid[instance_of[1:-1]])
    with txt_file_writer(reports_dir / 'eid_cids.txt') as report:
        for eid, types in eid_cids.items():
            report.write('%d\t%s\n' % (eid, str(types)))
    print('{:,} entities assigned in {}'.format(len(eid_cids), timer.lap_time))
    print()

    # ALL DONE
    # -------------------------------------------------------------------------->
    print('All done in {}'.format(timer.total_time))
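# The extraction above assumes tab-separated N-Triples lines whose object is a
# Wikidata entity URI. A minimal sketch of how one such line is handled (the
# sample line is invented for illustration and may not match the real dump
# byte-for-byte):
import re

re_wikidata_uri = re.compile(r'<http://www.wikidata.org/entity/Q(\d+)>')
sample = ('<http://yago-knowledge.org/resource/Douglas_Adams>\towl:sameAs\t'
          '<http://www.wikidata.org/entity/Q42>\t.')
yago_uri, _, same_as, _ = sample.strip().split('\t')
match = re.fullmatch(re_wikidata_uri, same_as)
if match:
    print(yago_uri, '=>', match.group(1))  # entity id '42'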
def register_warning(self, token_idx, comment, focus_idxs):
    warning = {'comment': comment, 'focusIdxs': focus_idxs}
    append_to_dict_list(self.__warnings, token_idx, warning)