def build(word2vec_bin, pdtb_dirs):
    """Map PDTB words/phrases from `pdtb_dirs` to word2vec vectors from `word2vec_bin`."""

    log.info("Loading word2vec model '{}'...".format(word2vec_bin))
    model = load_word2vec(word2vec_bin)

    log.info("Mapping stripped helper vocabulary...")
    strip_pat_2 = re.compile("_+|-+|/+|\\\\+| +")
    strip_pat_3a = re.compile("_+")
    strip_pat_3b = re.compile("[0-9][0-9]+")
    strip_pat_4a = re.compile("[^A-Za-z0-9#%$\\.,;/]")
    strip_pat_4b = re.compile("#+")
    strip_pat_5a = re.compile("[\\.,;/]")
    strip_pat_5b = re.compile("#+")
    strip_helpers = [  # progressively more aggressive normalizations
        (lambda text: text.lower()),                                        # lowercase
        (lambda text: strip_pat_2.sub("_", text)),                          # collapse runs of separators to "_"
        (lambda text: strip_pat_3b.sub("##", strip_pat_3a.sub("", text))),  # drop "_", mask multi-digit numbers as "##"
        (lambda text: strip_pat_4b.sub("#", strip_pat_4a.sub("", text))),   # keep only [A-Za-z0-9#%$.,;/], collapse "#"
        (lambda text: strip_pat_5b.sub("#", strip_pat_5a.sub("", text))),   # drop ".,;/", collapse "#"
    ]
    model_strips = map_strips_base(strip_helpers, model.vocab)

    log.info("Mapping words/phrases from {}...".format(pdtb_dirs))
    it = data_pdtb.PDTBParsesCorpus(pdtb_dirs, with_document=False, with_paragraph=False,
                                    with_sentence=True, word_split="-|\\\\/", word_meta=False)
    vocab, missing, total_cnt = map_sent_base(it, model.vocab, strip_helpers=strip_helpers,
                                              strip_vocabs=model_strips, only_longest=False)
    log.info("- mappings: {}, missing: {}, total words: {}".format(len(vocab), len(missing), total_cnt))

    log.info("Mapping vocabulary to word2vec vectors...")
    map_word2vec = map_base_word2vec(vocab, model)
    log.info("- words: {}".format(len(map_word2vec)))
    return map_word2vec, vocab, missing
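
# Usage sketch for build() (illustrative only; the word2vec binary and the PDTB
# parse directory below are placeholder paths, not paths from this repo):
#
#     map_word2vec, vocab, missing = build("GoogleNews-vectors-negative300.bin",
#                                          ["./conll15st-train/"])
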
def train_x_infinite():
    """Yield training documents indefinitely (restart the corpus after each pass).

    `args.train_dir` is expected to be available from the surrounding scope.
    """
    while True:
        train_x_it = data_pdtb.PDTBParsesCorpus(args.train_dir, with_document=True, with_paragraph=False,
                                                with_sentence=False, word_split="-|\\\\/", word_meta=True)
        for doc in train_x_it:
            yield doc
def load_words(pdtb_dir, relations):
    """Load PDTB words by document id.

    Example output:

        words[doc_id][0] = {
            'Text': "Kemper",
            'DocID': doc_id,
            'ParagraphID': 0,
            'SentenceID': 0,
            'SentenceToken': 0,
            'TokenList': [0],
            'PartOfSpeech': "NNP",
            'Linkers': ["arg1_14890"],
            'Tags': {"Explicit:Expansion.Conjunction:4:Arg1": 1},
        }
    """
    lpart_to_rpart = {"arg1": "Arg1", "arg2": "Arg2", "conn": "Connective"}

    words_it = data_pdtb.PDTBParsesCorpus(pdtb_dir, with_document=True, with_paragraph=False,
                                          with_sentence=False, word_split="-|\\\\/", word_meta=True)
    words = {}
    for doc in words_it:
        doc_id = doc[0]['DocID']

        # store by document id
        words[doc_id] = doc

        # add relation tags to each word
        for word in words[doc_id]:
            word['Tags'] = {}
            for linker in word['Linkers']:  # get relation ids for each word
                lpart, rid = linker.split("_")
                rpart = lpart_to_rpart[lpart]

                # find by relation id
                for relation in relations[doc_id]:
                    if rid == str(relation['ID']):  # relation found
                        tag = relation_to_tag(relation, rpart)
                        try:
                            word['Tags'][tag] += 1
                        except KeyError:
                            word['Tags'][tag] = 1
                        break  # only one
    return words
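
# Usage sketch for load_words() (the path and the `load_relations` helper are
# hypothetical illustrations; load_words() itself only requires `relations` to be
# a dict keyed by DocID whose values are lists of relation dicts carrying an 'ID'):
#
#     relations = load_relations("./conll15st-train/")   # {doc_id: [relation, ...]}
#     words = load_words("./conll15st-train/", relations)
#     words[doc_id][0]['Tags']  # e.g. {"Explicit:Expansion.Conjunction:4:Arg1": 1}
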