Example #1
def extract_mentions_spans(doc, blacklist=True, debug=False):
    '''
    Extract potential mentions from a spacy parsed Doc
    '''
    if debug: print('===== doc ====:', doc)
    for c in doc:
        if debug:
            print("🚧 span search:", c, "head:", c.head, "tag:", c.tag_,
                  "pos:", c.pos_, "dep:", c.dep_)
    # Named entities
    mentions_spans = list(ent for ent in doc.ents
                          if ent.label_ in ACCEPTED_ENTS)

    if debug:
        print("==-- ents:", list(
            ((ent, ent.label_) for ent in mentions_spans)))
    # Mention candidates extracted from each sentence (in parallel via _extract_from_sent)
    for spans in parallel_process(
            [{'doc': doc, 'span': sent, 'blacklist': blacklist} for sent in doc.sents],
            _extract_from_sent, use_kwargs=True, front_num=0):
        mentions_spans = mentions_spans + spans
    # Drop empty spans and deduplicate on (start, end) boundaries
    spans_set = set()
    cleaned_mentions_spans = []
    for spans in mentions_spans:
        if spans.end > spans.start and (spans.start, spans.end) not in spans_set:
            cleaned_mentions_spans.append(spans)
            spans_set.add((spans.start, spans.end))

    return cleaned_mentions_spans
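
This and the following examples rely on a parallel_process(array, function, ...) helper that fans the work out to a process pool. Its implementation is not shown on this page; the sketch below is only a hypothetical minimal version covering the keyword arguments used in these examples (use_kwargs, n_jobs, front_num), not the project's actual helper.

from concurrent.futures import ProcessPoolExecutor

def parallel_process(array, function, n_jobs=4, use_kwargs=False, front_num=0):
    """Apply `function` to every item of `array` (hypothetical sketch, not the real helper)."""
    call = (lambda a: function(**a)) if use_kwargs else function
    # Run the first `front_num` items serially, which helps surface errors early
    front = [call(a) for a in array[:front_num]]
    if n_jobs == 1:
        return front + [call(a) for a in array[front_num:]]
    with ProcessPoolExecutor(max_workers=n_jobs) as pool:
        futures = [pool.submit(function, **a) if use_kwargs else pool.submit(function, a)
                   for a in array[front_num:]]
        return front + [future.result() for future in futures]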
Example #2
    def build_and_gather_multiple_arrays(self, save_path):
        print("🌋 Extracting mentions features")
        parallel_process(self.docs, set_feats, n_jobs=self.n_jobs)

        print("🌋 Building and gathering arrays")
        arr = [{'doc': doc, 'i': i} for i, doc in enumerate(self.docs)]
        arrays_dicts = parallel_process(arr,
                                        get_feats,
                                        use_kwargs=True,
                                        n_jobs=self.n_jobs)
        # One gathered (concatenated) list per feature, plus running offsets
        # used to turn document-relative pair indices into global ones
        gathering_dict = dict((feat, None) for feat in FEATURES_NAMES)
        n_mentions_list = []
        pairs_ant_index = 0
        pairs_start_index = 0
        for n, p, arrays_dict in tqdm(arrays_dicts):
            for f in FEATURES_NAMES:
                if gathering_dict[f] is None:
                    gathering_dict[f] = arrays_dict[f]
                else:
                    if f == FEATURES_NAMES[6]:
                        array = [a + pairs_ant_index for a in arrays_dict[f]]
                    elif f == FEATURES_NAMES[3]:
                        array = [a + pairs_start_index for a in arrays_dict[f]]
                    else:
                        array = arrays_dict[f]
                    gathering_dict[f] += array
            pairs_ant_index += n
            pairs_start_index += p
            n_mentions_list.append(n)

        # The first nine features are converted to numpy arrays and saved with np.save
        for feature in FEATURES_NAMES[:9]:
            print("Building numpy array for", feature, "length",
                  len(gathering_dict[feature]))
            if feature != "mentions_spans":
                array = np.array(gathering_dict[feature])
                if array.ndim == 1:
                    array = np.expand_dims(array, axis=1)
            else:
                array = np.stack(gathering_dict[feature])
            # check_numpy_array(feature, array, n_mentions_list)
            print("Saving numpy", feature, "size", array.shape)
            np.save(save_path + feature, array)
        # The remaining features are kept as Python objects and pickled
        for feature in FEATURES_NAMES[9:]:
            print("Saving pickle", feature, "size",
                  len(gathering_dict[feature]))
            with open(save_path + feature + '.bin', "wb") as fp:
                pickle.dump(gathering_dict[feature], fp)
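
The index arithmetic in the gathering loop above is easy to miss: each per-document arrays_dict stores pair indices relative to that document, so they have to be shifted by the counts accumulated so far before concatenation. A small standalone illustration with made-up field names and numbers:

# Hypothetical per-document outputs: pair antecedent indices are local to each doc
doc_a = {'pairs_ant_index': [0, 0, 1], 'n_mentions': 3}
doc_b = {'pairs_ant_index': [0, 1], 'n_mentions': 2}

gathered, offset = [], 0
for d in (doc_a, doc_b):
    # Shift local indices by the number of mentions gathered before this doc
    gathered += [i + offset for i in d['pairs_ant_index']]
    offset += d['n_mentions']

print(gathered)  # [0, 0, 1, 3, 4] -> indices now point into the global mention list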
Example #3
    def build_key_file(self, data_path, key_file, debug=False):
        print("🌋 Building key file from corpus")
        print("Saving in", key_file)
        with io.open(key_file, "w", encoding='utf-8') as kf:
            if debug: print("Key file saved in", key_file)
            for dirpath, _, filenames in os.walk(data_path):
                print("In", dirpath)
                file_list = [os.path.join(dirpath, f) for f in filenames
                             if f.endswith(".v4_auto_conll") or f.endswith(".v4_gold_conll")]
                # Keep a *.v4_auto_conll file only if no *.v4_gold_conll twin exists
                cleaned_file_list = []
                for f in file_list:
                    fn = f.split('.')
                    if fn[1] == "v4_auto_conll":
                        gold = fn[0] + "." + "v4_gold_conll"
                        if gold not in file_list:
                            cleaned_file_list.append(f)
                    else:
                        cleaned_file_list.append(f)
                #self.load_file(file_list[0])
                # Read the CoNLL files in a pool of processes (one per CPU by default)
                doc_list = parallel_process(cleaned_file_list, read_file)
                for doc in doc_list:
                    kf.write(doc)
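
The auto/gold filtering in the inner loop is the subtle part: a *.v4_auto_conll file is skipped whenever its *.v4_gold_conll counterpart is present in the same directory. A quick standalone check with hypothetical paths (containing no other dots, matching the split('.') assumption made in the code above):

file_list = ['data/bn/abc_0001.v4_auto_conll',   # has a gold twin -> dropped
             'data/bn/abc_0001.v4_gold_conll',
             'data/bn/abc_0002.v4_auto_conll']   # no gold twin -> kept

cleaned = []
for f in file_list:
    base, ext = f.split('.', 1)
    if ext == "v4_auto_conll" and base + ".v4_gold_conll" in file_list:
        continue
    cleaned.append(f)

print(cleaned)  # ['data/bn/abc_0001.v4_gold_conll', 'data/bn/abc_0002.v4_auto_conll']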
Example #4
    def read_corpus(self, data_path, debug=False):
        print("🌋 Reading files")
        for dirpath, _, filenames in os.walk(data_path):
            print("In", dirpath, os.path.abspath(dirpath))
            file_list = [os.path.join(dirpath, f) for f in filenames
                         if f.endswith(".v4_auto_conll") or f.endswith(".v4_gold_conll")]
            # Same auto/gold filtering rule as in build_key_file above
            cleaned_file_list = []
            for f in file_list:
                fn = f.split('.')
                if fn[1] == "v4_auto_conll":
                    gold = fn[0] + "." + "v4_gold_conll"
                    if gold not in file_list:
                        cleaned_file_list.append(f)
                else:
                    cleaned_file_list.append(f)
            doc_list = parallel_process(cleaned_file_list, load_file)
            for docs in doc_list:  #executor.map(self.load_file, cleaned_file_list):
                for utts_text, utt_tokens, utts_corefs, utts_speakers, name, part in docs:
                    print("Imported", name)
                    if debug:
                        print("utts_text", utts_text)
                        print("utt_tokens", utt_tokens)
                        print("utts_corefs", utts_corefs)
                        print("utts_speakers", utts_speakers)
                        print("name, part", name, part)
                    self.utts_text += utts_text
                    self.utts_tokens += utt_tokens
                    self.utts_corefs += utts_corefs
                    self.utts_speakers += utts_speakers
                    self.utts_doc_idx += [len(self.docs_names)] * len(utts_text)
                    self.docs_names.append((name, part))
        print("utts_text size", len(self.utts_text))
        print("utts_tokens size", len(self.utts_tokens))
        print("utts_corefs size", len(self.utts_corefs))
        print("utts_speakers size", len(self.utts_speakers))
        print("utts_doc_idx size", len(self.utts_doc_idx))
        print("🌋 Building docs")
        for name, part in self.docs_names:
            self.docs.append(
                ConllDoc(name=name,
                         part=part,
                         nlp=None,
                         use_no_coref_list=False,
                         consider_speakers=True,
                         embedding_extractor=self.embed_extractor,
                         conll=CONLL_GENRES[name[:2]]))
        print("🌋 Loading spacy model")
        try:
            spacy.info('en_core_web_sm')
            model = 'en_core_web_sm'
        except IOError:
            print("No spacy 2 model detected, using spacy1 'en' model")
            spacy.info('en')
            model = 'en'
        nlp = spacy.load(model)
        print("🌋 Parsing utterances and filling docs")
        doc_iter = (s for s in self.utts_text)
        for utt_tuple in tqdm(
                zip(nlp.pipe(doc_iter), self.utts_tokens, self.utts_corefs,
                    self.utts_speakers, self.utts_doc_idx)):
            spacy_tokens, conll_tokens, corefs, speaker, doc_id = utt_tuple
            if debug:
                print(unicode_(self.docs_names[doc_id]), "-", spacy_tokens)
            doc = spacy_tokens
            if debug:
                out_str = "utterance " + unicode_(doc) + " corefs " + unicode_(corefs) + \
                          " speaker " + unicode_(speaker) + " doc_id " + unicode_(doc_id)
                print(out_str.encode('utf-8'))
            self.docs[doc_id].add_conll_utterance(
                doc,
                conll_tokens,
                corefs,
                speaker,
                use_gold_mentions=self.use_gold_mentions)
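
The parsing loop above streams texts through nlp.pipe and walks the aligned metadata lists with zip, so each parsed Doc arrives together with its CoNLL tokens, coreference chains, speaker and document index. A minimal standalone illustration of that pattern, assuming the en_core_web_sm model is installed and using made-up texts and speakers:

import spacy

nlp = spacy.load('en_core_web_sm')  # assumes the small English model is installed

texts = ["I saw her yesterday.", "She seemed happy."]
speakers = ["speaker#1", "speaker#2"]

# nlp.pipe parses the texts as a stream; zip keeps each Doc aligned with its metadata
for doc, speaker in zip(nlp.pipe(texts), speakers):
    print(speaker, [token.text for token in doc])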