Example #1
    def build_and_gather_multiple_arrays(self, save_path):
        print("🌋 Extracting mentions features with {} job(s)".format(
            self.n_jobs))
        parallel_process(self.docs, set_feats, n_jobs=self.n_jobs)

        print("🌋 Building and gathering array with {} job(s)".format(
            self.n_jobs))
        arr = [{'doc': doc, 'i': i} for i, doc in enumerate(self.docs)]
        arrays_dicts = parallel_process(arr,
                                        get_feats,
                                        use_kwargs=True,
                                        n_jobs=self.n_jobs)
        gathering_dict = dict((feat, None) for feat in FEATURES_NAMES)
        n_mentions_list = []
        pairs_ant_index = 0
        pairs_start_index = 0
        for npaidx in tqdm(range(len(arrays_dicts))):
            try:
                n, p, arrays_dict = arrays_dicts[npaidx]
            except (TypeError, ValueError):
                # empty arrays dict: nothing to unpack for this doc, skip it
                continue

            for f in FEATURES_NAMES:
                if gathering_dict[f] is None:
                    gathering_dict[f] = arrays_dict[f]
                else:
                    if f == FEATURES_NAMES[6]:
                        # shift indices past the mentions gathered from previous docs
                        array = [a + pairs_ant_index for a in arrays_dict[f]]
                    elif f == FEATURES_NAMES[3]:
                        # shift indices by the offset accumulated over previous docs
                        array = [a + pairs_start_index for a in arrays_dict[f]]
                    else:
                        array = arrays_dict[f]
                    gathering_dict[f] += array
            pairs_ant_index += n
            pairs_start_index += p
            n_mentions_list.append(n)

        for feature in FEATURES_NAMES[:9]:
            feature_data = gathering_dict[feature]
            if not feature_data:
                print("No data for", feature)
                continue
            print("Building numpy array for", feature, "length",
                  len(feature_data))
            if feature != "mentions_spans":
                array = np.array(feature_data)
                if array.ndim == 1:
                    array = np.expand_dims(array, axis=1)
            else:
                array = np.stack(feature_data)
            # check_numpy_array(feature, array, n_mentions_list)
            print("Saving numpy", feature, "size", array.shape)
            np.save(save_path + feature, array)
        for feature in FEATURES_NAMES[9:]:
            feature_data = gathering_dict[feature]
            if feature_data:
                print("Saving pickle", feature, "size", len(feature_data))
                with open(save_path + feature + '.bin', "wb") as fp:
                    pickle.dump(feature_data, fp)
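The arrays written above can be loaded back with numpy and pickle. A minimal sketch, assuming the same save_path and FEATURES_NAMES used by build_and_gather_multiple_arrays (the helper name load_gathered_arrays is illustrative, not part of the module):

import pickle
import numpy as np

def load_gathered_arrays(save_path, features_names):
    """Sketch: read back what build_and_gather_multiple_arrays saved."""
    arrays = {}
    for feature in features_names[:9]:
        # np.save appends ".npy" to the file name it is given
        arrays[feature] = np.load(save_path + feature + ".npy")
    for feature in features_names[9:]:
        with open(save_path + feature + ".bin", "rb") as fp:
            arrays[feature] = pickle.load(fp)
    return arrays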
Example #2
def extract_mentions_spans(doc, blacklist=True, debug=False):
    '''
    Extract potential mentions from a spacy parsed Doc
    '''
    if debug: print('===== doc ====:', doc)
    if debug:
        tablines = []
        print("🚧 span search:")
        for c in doc:
            tablines.append(["🚧", c, c.head, c.tag_, c.pos_, c.dep_])
            # if debug: print("🚧 span search:", c, "head:", c.head, "tag:", c.tag_, "pos:", c.pos_, "dep:", c.dep_)
        print(tabulate(tablines, headers=["", "Token", "Head", "Tag", "Pos", "Dep"]))
    # Named entities
    mentions_spans = list(ent for ent in doc.ents if ent.label_ in ACCEPTED_ENTS)

    if debug: print("==-- ents:", list(((ent, ent.label_) for ent in mentions_spans)))
    for spans in parallel_process([{'doc': doc,
                                    'span': sent,
                                    'blacklist': blacklist} for sent in doc.sents],
                                _extract_from_sent, use_kwargs=True, front_num=0):
        mentions_spans = mentions_spans + spans
    spans_set = set()
    cleaned_mentions_spans = []
    for spans in mentions_spans:
        if spans.end > spans.start and (spans.start, spans.end) not in spans_set:
            cleaned_mentions_spans.append(spans)
            spans_set.add((spans.start, spans.end))

    return cleaned_mentions_spans
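A possible way to call this function, assuming spaCy with an English model is installed and that ACCEPTED_ENTS, parallel_process and _extract_from_sent are importable from the surrounding module (the sentence is only illustrative):

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The president said she would veto the bill because she opposed it.")
mentions = extract_mentions_spans(doc, blacklist=True)
for span in mentions:
    # each mention is a spaCy Span carrying token offsets into the Doc
    print(span.start, span.end, span.text)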
Example #3
    def build_key_file(self, data_path, key_file, debug=False):
        print("🌋 Building key file from corpus")
        print("Saving in", key_file)
        with io.open(key_file, "w", encoding="utf-8") as kf:
            if debug:
                print("Key file saved in", key_file)
            for dirpath, _, filenames in os.walk(data_path):
                print("In", dirpath)
                file_list = [
                    os.path.join(dirpath, f) for f in filenames
                    if f.endswith(".v4_auto_conll")
                    or f.endswith(".v4_gold_conll")
                ]
                # Keep an auto_conll file only when its gold_conll counterpart is missing
                cleaned_file_list = []
                for f in file_list:
                    fn = f.split(".")
                    if fn[1] == "v4_auto_conll":
                        gold = fn[0] + "." + "v4_gold_conll"
                        if gold not in file_list:
                            cleaned_file_list.append(f)
                    else:
                        cleaned_file_list.append(f)
                # self.load_file(file_list[0])
                # Create a pool of processes. By default, one is created for each CPU in your machine.
                doc_list = parallel_process(cleaned_file_list, read_file)
                for doc in doc_list:
                    kf.write(doc)
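The auto/gold filtering above boils down to one rule: keep a .v4_auto_conll file only when no .v4_gold_conll counterpart appears in the same directory listing. The same rule in isolation, as a sketch (keep_file is an illustrative name and shares the original's assumption that only the extension contains a dot):

def keep_file(path, file_list):
    base, ext = path.split(".", 1)
    if ext == "v4_auto_conll":
        # prefer the gold annotation when both variants exist
        return base + ".v4_gold_conll" not in file_list
    return True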
Example #4
def extract_mentions_spans(doc, blacklist, debug=False):
    """
    Extract potential mentions from a spacy parsed Doc
    """
    if debug:
        print("===== doc ====:", doc)
    for c in doc:
        if debug:
            print(
                "🚧 span search:",
                c,
                "head:",
                c.head,
                "tag:",
                c.tag_,
                "pos:",
                c.pos_,
                "dep:",
                c.dep_,
            )
    # Named entities
    mentions_spans = list(ent for ent in doc.ents
                          if ent.label_ in ACCEPTED_ENTS)

    if debug:
        print("==-- ents:", list(
            ((ent, ent.label_) for ent in mentions_spans)))
    for spans in parallel_process(
        [{
            "doc": doc,
            "span": sent,
            "blacklist": blacklist
        } for sent in doc.sents],
            _extract_from_sent,
            use_kwargs=True,
            front_num=0,
    ):
        mentions_spans = mentions_spans + spans
    spans_set = set()
    cleaned_mentions_spans = []
    for spans in mentions_spans:
        if spans.end > spans.start and (spans.start,
                                        spans.end) not in spans_set:
            cleaned_mentions_spans.append(spans)
            spans_set.add((spans.start, spans.end))

    return cleaned_mentions_spans
Example #5
    def read_corpus(self, data_path, model=None, debug=False):
        print("🌋 Reading files")
        for dirpath, _, filenames in os.walk(data_path):
            print("In", dirpath, os.path.abspath(dirpath))
            file_list = [
                os.path.join(dirpath, f) for f in filenames
                if f.endswith(".v4_auto_conll") or f.endswith(".v4_gold_conll")
            ]
            cleaned_file_list = []
            for f in file_list:
                fn = f.split(".")
                if fn[1] == "v4_auto_conll":
                    gold = fn[0] + "." + "v4_gold_conll"
                    if gold not in file_list:
                        cleaned_file_list.append(f)
                else:
                    cleaned_file_list.append(f)
            doc_list = parallel_process(cleaned_file_list, load_file)
            for docs in doc_list:  # executor.map(self.load_file, cleaned_file_list):
                for (
                        utts_text,
                        utt_tokens,
                        utts_corefs,
                        utts_speakers,
                        name,
                        part,
                ) in docs:
                    if debug:
                        print("Imported", name)
                        print("utts_text", utts_text)
                        print("utt_tokens", utt_tokens)
                        print("utts_corefs", utts_corefs)
                        print("utts_speakers", utts_speakers)
                        print("name, part", name, part)
                    self.utts_text += utts_text
                    self.utts_tokens += utt_tokens
                    self.utts_corefs += utts_corefs
                    self.utts_speakers += utts_speakers
                    self.utts_doc_idx += [len(self.docs_names)
                                          ] * len(utts_text)
                    self.docs_names.append((name, part))
        print("utts_text size", len(self.utts_text))
        print("utts_tokens size", len(self.utts_tokens))
        print("utts_corefs size", len(self.utts_corefs))
        print("utts_speakers size", len(self.utts_speakers))
        print("utts_doc_idx size", len(self.utts_doc_idx))
        print("🌋 Building docs")
        for name, part in self.docs_names:
            self.docs.append(
                ConllDoc(
                    name=name,
                    part=part,
                    nlp=None,
                    blacklist=self.blacklist,
                    consider_speakers=True,
                    embedding_extractor=self.embed_extractor,
                    conll=CONLL_GENRES[name[:2]],
                ))
        print("🌋 Loading spacy model")

        if model is None:
            model_options = [
                "en_core_web_lg", "en_core_web_md", "en_core_web_sm", "en"
            ]
            for model_option in model_options:
                if not model:
                    try:
                        spacy.info(model_option)
                        model = model_option
                        print("Loading model", model_option)
                    except Exception:
                        print("Could not detect model", model_option)
            if not model:
                print("Could not detect any suitable English model")
                return
        else:
            spacy.info(model)
            print("Loading model", model)
        nlp = spacy.load(model)
        print("🌋 Parsing utterances and filling docs with use_gold_mentions=" +
              (str(bool(self.gold_mentions))))
        doc_iter = (s for s in self.utts_text)
        for utt_tuple in tqdm(
                zip(
                    nlp.pipe(doc_iter),
                    self.utts_tokens,
                    self.utts_corefs,
                    self.utts_speakers,
                    self.utts_doc_idx,
                )):
            spacy_tokens, conll_tokens, corefs, speaker, doc_id = utt_tuple
            if debug:
                print(unicode_(self.docs_names[doc_id]), "-", spacy_tokens)
            doc = spacy_tokens
            if debug:
                out_str = ("utterance " + unicode_(doc) + " corefs " +
                           unicode_(corefs) + " speaker " + unicode_(speaker) +
                           " doc_id " + unicode_(doc_id))
                print(out_str.encode("utf-8"))
            self.docs[doc_id].add_conll_utterance(
                doc,
                conll_tokens,
                corefs,
                speaker,
                use_gold_mentions=self.gold_mentions)
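The spaCy model fallback inside read_corpus can be expressed on its own. A minimal sketch, assuming spaCy is installed and relying, as the loop above does, on spacy.info raising when a model is not available (pick_english_model is an illustrative name):

import spacy

def pick_english_model(candidates=("en_core_web_lg", "en_core_web_md", "en_core_web_sm", "en")):
    for name in candidates:
        try:
            spacy.info(name)  # raises if the model cannot be found
            return name
        except Exception:
            print("Could not detect model", name)
    return None

model = pick_english_model()
nlp = spacy.load(model) if model is not None else None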