Example #1
import logging
import re

log = logging.getLogger(__name__)

# Project-level helpers assumed to be importable from the surrounding package:
# load_word2vec, data_pdtb, map_strips_base, map_sent_base, map_base_word2vec.

def build(word2vec_bin, pdtb_dirs):
    log.info("Loading word2vec model '{}'...".format(word2vec_bin))
    model = load_word2vec(word2vec_bin)

    log.info("Mapping stripped helper vocabulary...")
    strip_pat_2 = re.compile("_+|-+|/+|\\\\+| +")
    strip_pat_3a = re.compile("_+")
    strip_pat_3b = re.compile("[0-9][0-9]+")
    strip_pat_4a = re.compile("[^A-Za-z0-9#%$\\.,;/]")
    strip_pat_4b = re.compile("#+")
    strip_pat_5a = re.compile("[\\.,;/]")
    strip_pat_5b = re.compile("#+")
    strip_helpers = [
        (lambda text: text.lower()),
        (lambda text: strip_pat_2.sub("_", text)),
        (lambda text: strip_pat_3b.sub("##", strip_pat_3a.sub("", text))),
        (lambda text: strip_pat_4b.sub("#", strip_pat_4a.sub("", text))),
        (lambda text: strip_pat_5b.sub("#", strip_pat_5a.sub("", text))),
    ]
    model_strips = map_strips_base(strip_helpers, model.vocab)

    log.info("Mapping words/phrases from {}...".format(pdtb_dirs))
    it = data_pdtb.PDTBParsesCorpus(pdtb_dirs,
                                    with_document=False,
                                    with_paragraph=False,
                                    with_sentence=True,
                                    word_split="-|\\\\/",
                                    word_meta=False)
    vocab, missing, total_cnt = map_sent_base(it, model.vocab,
                                              strip_helpers=strip_helpers,
                                              strip_vocabs=model_strips,
                                              only_longest=False)
    log.info("- mappings: {}, missing: {}, total words: {}".format(len(vocab), len(missing), total_cnt))

    log.info("Mapping vocabulary to word2vec vectors...")
    map_word2vec = map_base_word2vec(vocab, model)
    log.info("- words: {}".format(len(map_word2vec)))

    return map_word2vec, vocab, missing
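
A minimal driver sketch (the file names and call pattern below are placeholders, not taken from the snippet above):

# Hypothetical usage: build the mapping from a word2vec binary and a list of PDTB parse
# directories, then inspect the result. Paths are illustrative only.
map_word2vec, vocab, missing = build("vectors.bin", ["./pdtb/parses/"])
log.info("vectors: {}, mapped: {}, missing: {}".format(len(map_word2vec), len(vocab), len(missing)))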
Example #2
# `args` and `data_pdtb` are assumed to come from the surrounding module.
def train_x_infinite():
    # Endlessly re-iterate the training documents, yielding one document at a time.
    while True:
        train_x_it = data_pdtb.PDTBParsesCorpus(args.train_dir,
                                                with_document=True,
                                                with_paragraph=False,
                                                with_sentence=False,
                                                word_split="-|\\\\/",
                                                word_meta=True)
        for doc in train_x_it:
            yield doc
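
A sketch of how the infinite stream might be consumed; the fixed-size batching is an illustrative assumption, not part of the original code:

# Hypothetical consumer: group the endlessly yielded documents into batches of `size`.
def train_x_batches(size=16):
    batch = []
    for doc in train_x_infinite():
        batch.append(doc)
        if len(batch) == size:
            yield batch
            batch = []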
def load_words(pdtb_dir, relations):
    """Load PDTB words by document id.

    Example output:

        words[doc_id][0] = {
            'Text': "Kemper",
            'DocID': doc_id,
            'ParagraphID': 0,
            'SentenceID': 0,
            'SentenceToken': 0,
            'TokenList': [0],
            'PartOfSpeech': "NNP",
            'Linkers': ["arg1_14890"],
            'Tags': {"Explicit:Expansion.Conjunction:4:Arg1": 1},
        }
    """

    lpart_to_rpart = {"arg1": "Arg1", "arg2": "Arg2", "conn": "Connective"}
    words_it = data_pdtb.PDTBParsesCorpus(pdtb_dir,
                                          with_document=True,
                                          with_paragraph=False,
                                          with_sentence=False,
                                          word_split="-|\\\\/",
                                          word_meta=True)

    words = {}
    for doc in words_it:
        doc_id = doc[0]['DocID']

        # store by document id
        words[doc_id] = doc

        # add relation tags to each word
        for word in words[doc_id]:
            word['Tags'] = {}
            for linker in word['Linkers']:  # get relation ids for each word
                lpart, rid = linker.split("_")
                rpart = lpart_to_rpart[lpart]

                # find by relation id
                for relation in relations[doc_id]:
                    if rid == str(relation['ID']):  # relation found
                        tag = relation_to_tag(relation, rpart)
                        try:
                            word['Tags'][tag] += 1
                        except KeyError:
                            word['Tags'][tag] = 1
                        break  # only one
    return words
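
A minimal usage sketch, assuming relations have already been loaded and keyed by document id (the `load_relations` helper and the directory path are hypothetical):

# Hypothetical driver: attach relation tags to every word of every document.
relations = load_relations("./conll15st-train/")  # assumed helper returning {doc_id: [relation, ...]}
words = load_words("./conll15st-train/", relations)
doc_id = next(iter(words))
print(words[doc_id][0]['Text'], words[doc_id][0]['Tags'])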