def _load(self):
    # load vocab file
    self.vocab = OrderedDict()
    with open(self.vocab_file, encoding='utf-8') as vf:
        for line in vf.readlines():
            line = line.strip()
            self.vocab[line] = len(self.vocab)

    # filter glove
    if self.pretrained_file != '' and os.path.exists(self.pretrained_file):
        glove_emb = {}
        with open(self.pretrained_file, 'r', encoding='utf-8') as pf:
            for line in pf.readlines():
                sp = line.split(' ')
                if sp[0].lower() in self.vocab:
                    glove_emb[sp[0].lower()] = np.array([float(x) for x in sp[1:]])

    files = ['{}.txt'.format(self.mode)]
    corpus = BracketParseCorpusReader('{}/sst'.format(self.dir), files)
    sents = corpus.parsed_sents(files[0])

    # initialize with glove
    pretrained_emb = []
    fail_cnt = 0
    for line in self.vocab.keys():
        if self.pretrained_file != '' and os.path.exists(self.pretrained_file):
            if line.lower() not in glove_emb:
                fail_cnt += 1
            pretrained_emb.append(glove_emb.get(line.lower(),
                                                np.random.uniform(-0.05, 0.05, 300)))

    if self.pretrained_file != '' and os.path.exists(self.pretrained_file):
        self.pretrained_emb = F.tensor(np.stack(pretrained_emb, 0))
        print('Miss word in GloVe {0:.4f}'.format(1.0 * fail_cnt / len(self.pretrained_emb)))

    # build trees
    for sent in sents:
        self.trees.append(self._build_tree(sent))
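For reference, the reader call at the heart of this loader can be exercised on its own. A minimal sketch, assuming a directory data/sst holding a train.txt with one bracketed parse per line (both names are placeholders standing in for self.dir and self.mode):

from nltk.corpus.reader import BracketParseCorpusReader

corpus = BracketParseCorpusReader('data/sst', ['train.txt'])
for tree in corpus.parsed_sents('train.txt'):
    # each tree is an nltk.Tree; label() is the root tag, leaves() the tokens
    print(tree.label(), ' '.join(tree.leaves()))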
def make_dataset(self, corpus: str) -> Dataset:
    reader = BracketParseCorpusReader(
        *os.path.split(corpus), encoding=self.encoding, detect_blocks='sexpr')
    oracles = [DiscOracle.from_tree(t) for t in reader.parsed_sents()]
    examples = [make_example(x, self.fields) for x in oracles]
    return Dataset(examples, self.fields)
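Here detect_blocks='sexpr' tells the reader to split the file on balanced-parenthesis s-expressions rather than the default 'unindented_paren' heuristic, which is the safer choice when every tree sits on a single line. A standalone sketch of just that reader setup (trees.mrg is a placeholder path):

import os
from nltk.corpus.reader import BracketParseCorpusReader

path = 'trees.mrg'  # placeholder: one bracketed tree per line
reader = BracketParseCorpusReader(*os.path.split(path), detect_blocks='sexpr')
print(len(reader.parsed_sents()))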
import os
from collections import defaultdict as dd
from tempfile import NamedTemporaryFile

import ujson
from nltk.corpus.reader import BracketParseCorpusReader
from nltk.stem import WordNetLemmatizer


def get_stats_from_snli_dataset(files, tagset=("NN", "NNS"), use_lemmas=False):
    lemmatizer = None
    if use_lemmas:
        lemmatizer = WordNetLemmatizer()
    stats = dd(int)
    num_of_token = 0
    for filename in files:
        # collect the parse fields into a temp file the corpus reader can
        # consume; open in text mode and flush before reading it back
        f = NamedTemporaryFile(mode='w', dir='/tmp')
        fields_to_read = {"sentence1_parse", "sentence2_parse"}
        for sent in open(filename):
            sent = ujson.loads(sent)
            for field in fields_to_read:
                f.write("%s\n" % sent[field])
        f.flush()
        reader = BracketParseCorpusReader("/tmp", os.path.basename(f.name))
        for word, tag in reader.tagged_words():
            if tagset is None or tag in tagset:
                if use_lemmas:
                    word = lemmatizer.lemmatize(word, pos=tag.lower()[0])
                stats[word] += 1
                num_of_token += 1
    return stats, num_of_token
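A call might look like the following (the file name is hypothetical); the function returns the per-word counts restricted to the given tagset, plus the total number of matching tokens:

files = ['snli_1.0_train.jsonl']  # hypothetical path
stats, total = get_stats_from_snli_dataset(files, tagset=("NN", "NNS"))
print(total, sorted(stats.items(), key=lambda kv: -kv[1])[:10])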
from nltk.corpus.reader import BracketParseCorpusReader


def reader(corpus_dir):
    """
    An instantiated NLTK BracketParseCorpusReader for the PTB section
    relevant to the PDTB corpus. Note that the path you give to this will
    probably end with something like `parsed/mrg/wsj`.
    """
    return BracketParseCorpusReader(corpus_dir, r'../wsj_.*\.mrg',
                                    encoding='ascii')
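Typical use, with a placeholder corpus path. File ids such as 00/wsj_0001.mrg match the r'../wsj_.*\.mrg' pattern because the leading .. matches the two-character section directory:

ptb = reader('/path/to/ptb/parsed/mrg/wsj')  # placeholder path
first = ptb.parsed_sents(ptb.fileids()[0])[0]
first.pretty_print()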
from collections import OrderedDict

import numpy as np
from nltk.corpus.reader import BracketParseCorpusReader


def text2DGL(source_file, vocab_file, embed_file, word_dim):
    # vocab (stoi): {word: index}
    vocab = OrderedDict()
    with open(vocab_file, encoding='utf-8') as vf:
        for line in vf.readlines():
            line = line.strip()
            vocab[line] = len(vocab)

    # enrich word embedding: overwrite random init with pretrained vectors
    embedding = np.random.random((len(vocab), word_dim))
    with open(embed_file, 'r', encoding='utf-8') as pf:
        for line in pf.readlines():
            sp = line.split(' ')
            if sp[0].lower() in vocab:
                embedding[vocab[sp[0].lower()]] = np.array(
                    [float(x) for x in sp[1:]])

    # build dgl trees from file; the empty root means source_file is
    # resolved relative to the current working directory
    files = [source_file]
    corpus = BracketParseCorpusReader('', files)
    sents = corpus.parsed_sents(files[0])
    trees = [build_tree(sent, vocab) for sent in sents]
    return trees, embedding, vocab
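A hedged call sketch with hypothetical paths; since the reader's root is the empty string, source_file must be resolvable from the current working directory:

trees, embedding, vocab = text2DGL(
    source_file='data/train.txt',          # hypothetical paths
    vocab_file='data/vocab.txt',
    embed_file='data/glove.6B.300d.txt',
    word_dim=300)
print(len(trees), embedding.shape, len(vocab))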
# usage: hw4_topcfg.sh <treebank_filename> <output_PCFG_file>
import os
import re
import sys
from collections import Counter

from nltk.corpus.reader import BracketParseCorpusReader

if __name__ == "__main__":
    PATH_TRAIN = sys.argv[1]
    out = sys.argv[2]
    match = re.search(r"(?s:.*)/", PATH_TRAIN)
    DIR_TRAIN = match.group(0) if match else os.getcwd()

    # read in parsed corpus (raw text is kept for node counting below)
    with open(PATH_TRAIN) as f:
        data = f.read()
    parsed_data = BracketParseCorpusReader(
        DIR_TRAIN, os.path.basename(PATH_TRAIN)).parsed_sents()

    # get counts of all non-terminals
    counts_nodes = Counter(re.findall(r"\(([A-Z_]+) ", data))

    # get counts of all rules
    list_counts_rules = []
    for sent in parsed_data:
        traverse_tree(sent, list_counts_rules)
    counts_rules = Counter(list_counts_rules)

    # relative-frequency estimate: P(rule) = count(rule) / count(LHS)
    prob_rules = dict()
    for rule in counts_rules:
        node = re.findall(r"([A-Z_]+)", rule)[0]
        prob_rules[rule] = counts_rules[rule] / counts_nodes[node]
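traverse_tree is a helper defined elsewhere in this assignment. For trees already produced by the reader, NLTK's own Tree.productions() yields the same CFG rules directly, so a sketch of the counting step without that helper could read:

from collections import Counter

counts_rules = Counter(str(p) for sent in parsed_data
                       for p in sent.productions())
counts_nodes = Counter(str(p.lhs()) for sent in parsed_data
                       for p in sent.productions())
# relative-frequency estimate, e.g. P(S -> NP VP) = c(S -> NP VP) / c(S)
prob_rules = {rule: count / counts_nodes[rule.split(' ->')[0]]
              for rule, count in counts_rules.items()}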
# WIP
# dirty, almost copies from educe.rst_dt.ptb.PtbParser...
# TODO go and fix educe.rst_dt.{ptb, corenlp}
import os

from nltk.corpus.reader import BracketParseCorpusReader

PTB_DIR = os.path.join(
    os.path.dirname(__file__),
    '..', '..', 'data',  # alt: '..', '..', 'corpora',
    'PTBIII', 'parsed', 'mrg', 'wsj')
# FIXME this fails when PTB_DIR does not exist;
# I need to find a clean way to address this
PTB_READER = BracketParseCorpusReader(PTB_DIR, r'../wsj_.*\.mrg',
                                      encoding='ascii')


def tokenize_doc_ptb(doc_id, doc_text):
    """Dirty PTB tokenizer"""
    ptb_name = _guess_ptb_name(doc_id)
    if ptb_name is None:
        return None
    # get doc text
    # here we cheat and get it from the RST-DT tree
    # was: rst_text = doc.orig_rsttree.text()
    rst_text = doc_text
    tagged_tokens = PTB_READER.tagged_words(ptb_name)
    # tweak tokens THEN filter empty nodes
def __init__(self, corpus_dir):
    """Wrap a BracketParseCorpusReader over the PTB `.mrg` files found
    under `corpus_dir`."""
    self.reader = BracketParseCorpusReader(corpus_dir, r'../wsj_.*\.mrg',
                                           encoding='ascii')
def __init__(self, corpus_file: str, lowercase: bool = True) -> None:
    self.corpus_file = corpus_file
    self.lowercase = lowercase
    self._reader = BracketParseCorpusReader(*os.path.split(corpus_file))
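The *os.path.split(corpus_file) idiom turns one full path into the (root, fileids) pair the reader expects; standalone, that is equivalent to:

import os
from nltk.corpus.reader import BracketParseCorpusReader

corpus_file = '/data/treebank/train.mrg'  # hypothetical path
# same as BracketParseCorpusReader('/data/treebank', 'train.mrg')
reader = BracketParseCorpusReader(*os.path.split(corpus_file))
print(len(reader.sents()))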