def get_tokens(fname, stopwords):
    # NOTE: the \p{L} / \p{N} character classes require the third-party
    # `regex` module (import regex as re); the stdlib `re` does not support them.
    with open(fname, encoding='utf-8') as f:
        text = f.read()
    text = re.sub(r'\d', '9', text)
    # word_re = re.compile(r'(\p{L}[\p{L}_-]+|\p{P}+)')
    word_re = re.compile(r'(\p{L}[\p{L}_-]*|\p{N}+)')
    tokens = word_re.findall(text)
    # retain = set(['NOUN', 'ADJ', 'ADV', 'PROPN'])
    retain = set(['NOUN', 'ADJ'])
    # pos_tagged = pos_tag(tokens)
    # print("pos_tags:", " ".join("{}_{}".format(*t) for t in pos_tagged))
    tokens = [tok for tok, tag in pos_tag(tokens)
              if map_tag('en-ptb', 'universal', tag) in retain]
    tokens = [tok.lower() for tok in tokens]
    tokens = [tok for tok in tokens if tok not in stopwords]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(tok) for tok in tokens]
    # print("=====")
    # print("tokens:", " ".join(tokens))
    # print("=====")
    return tokens
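# Hedged usage sketch for get_tokens: the file name and sentence are invented,
# and the exact output depends on the tagger; assumes the NLTK stopwords and
# tagger data are installed and `regex` is imported as `re` for the Unicode
# classes above.
from nltk.corpus import stopwords
with open('sample.txt', 'w', encoding='utf-8') as out:
    out.write('The quick brown foxes admired the beautiful gardens.')
print(get_tokens('sample.txt', set(stopwords.words('english'))))
# e.g. ['quick', 'brown', 'fox', 'beauti', 'garden']  (tagger-dependent)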
def _pos_tag(tokens, tagset=None, tagger=None, lang=None):
    # Currently only supports English and Russian.
    if lang not in ['eng', 'rus']:
        raise NotImplementedError(
            "Currently, NLTK pos_tag only supports English and Russian "
            "(i.e. lang='eng' or lang='rus')")
    else:
        tagged_tokens = tagger.tag(tokens)
        if tagset:  # Maps to the specified tagset.
            if lang == 'eng':
                tagged_tokens = [(token, map_tag('en-ptb', tagset, tag))
                                 for (token, tag) in tagged_tokens]
            elif lang == 'rus':
                # Note that the new Russian pos tags from the model contain suffixes,
                # see https://github.com/nltk/nltk/issues/2151#issuecomment-430709018
                tagged_tokens = [(token, map_tag('ru-rnc-new', tagset, tag.partition('=')[0]))
                                 for (token, tag) in tagged_tokens]
    return tagged_tokens
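# map_tag collapses a fine-grained tag into a target tagset. A few English
# PTB -> universal examples (requires the 'universal_tagset' NLTK resource):
from nltk.tag.mapping import map_tag
print(map_tag('en-ptb', 'universal', 'NNS'))  # 'NOUN'
print(map_tag('en-ptb', 'universal', 'VBZ'))  # 'VERB'
print(map_tag('en-ptb', 'universal', 'JJ'))   # 'ADJ'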
def create_corpus(tagged):
    """
    Take a list of tagged words and return a corpus as a list of tagged
    words with universal tagging, filtering out all non-word entries.
    """
    corpus = []
    for pair in tagged:
        if match(r'[a-zA-Z0-9_-]+', pair[0]):
            try:
                corpus.append(Word(pair[0], map_tag('en-ptb', 'universal', pair[1])))
            except KeyError:
                print('Part mismatch:', pair[0], pair[1])
    return corpus
def compute_features(self, s, count):
    # preprocess
    tok_sent = nltk.tokenize.word_tokenize(s)
    stop_tok_sent = [x for x in tok_sent if x not in cachedStopWords]
    # location features
    P = 1.0 / count
    F5 = 1 if count <= 5 else 0
    LEN = len(stop_tok_sent) / 30.0
    # language modelling
    LM = LModel.score(s)
    # pos tagging features
    tag_fd = FreqDist(map_tag("en-ptb", "universal", tag)
                      if map_tag("en-ptb", "universal", tag) not in cachedStopPOStags
                      else "OTHER"
                      for (word, tag) in pos_tagger(tok_sent))
    NN = tag_fd.freq("NOUN")
    VB = tag_fd.freq("VERB")
    # headline-sentence similarity
    VS1 = 1 - spatial.distance.cosine(self.hl_vsv_1.toarray(),
                                      self.father.cv.transform([s]).toarray())
    TFIDF = 1 - spatial.distance.cosine(self.hl_tfidf.toarray(),
                                        self.father.tv.transform([s]).toarray())
    # topic description-sentence similarity
    CT = 1 - spatial.distance.cosine(self.father.desc_vsv.toarray(),
                                     self.father.cv.transform([s]).toarray())
    Q = 1 - spatial.distance.cosine(self.father.title_vsv.toarray(),
                                    self.father.cv.transform([s]).toarray())
    # security checks
    if math.isnan(VS1):
        VS1 = 0
        print(self.father.code, self.id)
    if math.isnan(CT):
        CT = 0
        print(self.father.code, self.id)
    if math.isnan(Q):
        Q = 0
        print(self.father.code, self.id)
    # active features
    return np.asarray([P, F5, LEN, LM, VS1, TFIDF, VB, NN, CT, Q])
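# A minimal, hedged sketch of the NN/VB features above in isolation, on a
# made-up sentence; assumes the standard NLTK tagger and tokenizer data are
# installed (none of the class attributes used by compute_features are needed):
from nltk import FreqDist, pos_tag, word_tokenize
from nltk.tag.mapping import map_tag

sent = "The committee approved the new budget."
fd = FreqDist(map_tag('en-ptb', 'universal', tag)
              for _, tag in pos_tag(word_tokenize(sent)))
print(fd.freq('NOUN'), fd.freq('VERB'))  # relative frequency of nouns and verbs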
def tagstr2tree(s, chunk_label="NP", root_label="S", sep="/", source_tagset=None, target_tagset=None): """ Divide a string of bracketted tagged text into chunks and unchunked tokens, and produce a Tree. Chunks are marked by square brackets (``[...]``). Words are delimited by whitespace, and each word should have the form ``text/tag``. Words that do not contain a slash are assigned a ``tag`` of None. :param s: The string to be converted :type s: str :param chunk_label: The label to use for chunk nodes :type chunk_label: str :param root_label: The label to use for the root of the tree :type root_label: str :rtype: Tree """ WORD_OR_BRACKET = re.compile(r"\[|\]|[^\[\]\s]+") stack = [Tree(root_label, [])] for match in WORD_OR_BRACKET.finditer(s): text = match.group() if text[0] == "[": if len(stack) != 1: raise ValueError("Unexpected [ at char {:d}".format( match.start())) chunk = Tree(chunk_label, []) stack[-1].append(chunk) stack.append(chunk) elif text[0] == "]": if len(stack) != 2: raise ValueError("Unexpected ] at char {:d}".format( match.start())) stack.pop() else: if sep is None: stack[-1].append(text) else: word, tag = str2tuple(text, sep) if source_tagset and target_tagset: tag = map_tag(source_tagset, target_tagset, tag) stack[-1].append((word, tag)) if len(stack) != 1: raise ValueError("Expected ] at char {:d}".format(len(s))) return stack[0]
def extract_reverb_patterns(text):
    text_tokens = word_tokenize(text)
    tags_ptb = pos_tag(text_tokens)
    tags = []
    for t in tags_ptb:
        tag = map_tag('en-ptb', 'universal', t[1])
        tags.append((t[0], tag))
    patterns = []
    patterns_tags = []
    i = 0
    limit = len(tags) - 1
    while i <= limit:
        tmp = io.StringIO()
        tmp_tags = []
        # a ReVerb pattern always starts with a verb
        if tags[i][1] == 'VERB':
            tmp.write(tags[i][0] + ' ')
            t = (tags[i][0], tags[i][1])
            tmp_tags.append(t)
            i += 1
            # V = verb particle? adv? (also capture auxiliary verbs)
            while i <= limit and tags[i][1] in ['VERB', 'PRT', 'ADV']:
                tmp.write(tags[i][0] + ' ')
                t = (tags[i][0], tags[i][1])
                tmp_tags.append(t)
                i += 1
            # W = (noun | adj | adv | pron | det)
            while i <= limit and tags[i][1] in ['NOUN', 'ADJ', 'ADV', 'PRON', 'DET']:
                tmp.write(tags[i][0] + ' ')
                t = (tags[i][0], tags[i][1])
                tmp_tags.append(t)
                i += 1
            # P = (prep | particle | inf. marker)
            while i <= limit and tags[i][1] in ['ADP', 'PRT']:
                tmp.write(tags[i][0] + ' ')
                t = (tags[i][0], tags[i][1])
                tmp_tags.append(t)
                i += 1
            # add the built pattern to the list of collected patterns
            patterns.append(tmp.getvalue())
            patterns_tags.append(tmp_tags)
        i += 1
    return patterns, patterns_tags
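# Hedged usage sketch for extract_reverb_patterns; the sentence is invented
# and the exact patterns depend on the tagger's output:
patterns, pattern_tags = extract_reverb_patterns(
    "Einstein was born in Ulm and studied physics in Zurich.")
print(patterns)
# e.g. ['was born in ', 'studied physics in ']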
def parse(self, orig_tokens):
    if orig_tokens and type(orig_tokens[0]) is tuple:
        tokens = [token for token, _ in orig_tokens]
    else:
        tokens = orig_tokens
    tokenized_ud = list(
        map(lambda x: (x[0], map_tag('ru-rnc', 'universal', x[1])),
            pos_tag(tokens, lang='rus')))
    tokenized_nltk = pos_tag(tokens, lang='rus')
    tokenized_mystem = [(token, self.mystem_tagger.tag_word(token)[0][1])
                        for token in tokens]
    # print(self.chunker_iis.parse(tokenized_ud))
    tags_nltk = self.chunker_nltk.parse(tokenized_nltk, return_tree=False)
    tags_ud = self.chunker_nltk.parse(tokenized_ud, return_tree=False)
    tags_mystem = self.chunker_nltk.parse(tokenized_mystem, return_tree=False)
    tags_iis = tree2conlltags(self.chunker_iis.parse(tokenized_ud))
    tags_grammar = tree2conlltags(
        self.grammar_chunker.parse(tokenized_mystem))
    result_tags = [tags_nltk, tags_ud, tags_mystem, tags_grammar, tags_iis]
    if tokens is orig_tokens:
        tag_source = tags_ud
    else:
        tag_source = orig_tokens
    tags = [(token, tag_source[ind][1],
             pick_tag([tags_sp[ind][2] for tags_sp in result_tags],
                      tags_ud[ind][1]))
            for ind, token in enumerate(tokens)]
    # for ind, (token, pos, iob_tag) in enumerate(tags):
    #     if token in set(['таких', 'такие', 'такими', 'как', 'включая', 'и', 'или',
    #                      'другие', 'других', 'другими', 'особенно', 'в', 'частности', ',']):
    #         tags[ind] = (token, pos, 'O')
    for ind, (token, pos, iob_tag) in enumerate(tags):
        if ind == 0:
            continue
        if iob_tag == "B-NP*":
            if tags[ind - 1][2] in {'B-NP', 'I-NP'}:
                tags[ind] = (token, pos, 'I-NP')
            else:
                tags[ind] = (token, pos, 'B-NP')
        if iob_tag == "I-NP" and tags[ind - 1][2] not in {'B-NP', 'I-NP'}:
            tags[ind] = (token, pos, 'B-NP')
    return conlltags2tree(tags)
def _pos_tag(tokens, tagset=None, tagger=None, lang=None):
    # Currently only supports English and Russian.
    if lang not in ["eng", "rus"]:
        raise NotImplementedError(
            "Currently, NLTK pos_tag only supports English and Russian "
            "(i.e. lang='eng' or lang='rus')")
    # Raise an error if tokens is a string rather than a list of strings.
    elif isinstance(tokens, str):
        raise TypeError("tokens: expected a list of strings, got a string")
    else:
        tagged_tokens = tagger.tag(tokens)
        if tagset:  # Maps to the specified tagset.
            if lang == "eng":
                tagged_tokens = [(token, map_tag("en-ptb", tagset, tag))
                                 for (token, tag) in tagged_tokens]
            elif lang == "rus":
                # Note that the new Russian pos tags from the model contain suffixes,
                # see https://github.com/nltk/nltk/issues/2151#issuecomment-430709018
                tagged_tokens = [(token, map_tag("ru-rnc-new", tagset, tag.partition("=")[0]))
                                 for (token, tag) in tagged_tokens]
    return tagged_tokens
def featurize(line, idnum, pat):
    word_tag = hpt.tag(line.split())  # tag the whole sentence
    last = len(word_tag) - 1
    past_nuc = ''
    past_kind = ''
    filename = pat + '/' + str(idnum)
    with open(str(filename), 'a') as file:  # open the json file
        for (i, (word, tag)) in enumerate(word_tag):
            current_word = []
            current_word.append(('word', word.lower()))
            current_word.append(('tag', tag))
            # source='en-ptb' is the WSJ/Penn Treebank tagset
            current_word.append(('collps_tag',
                                 map_tag(source='en-ptb', target='universal',
                                         source_tag=tag)))
            current_word.append(('function', bool(word in function)))
            current_word.append(('negation', bool(word in negation)))
            try:
                phns = d[word.lower()][0]
            except KeyError:
                f.write(word)
                f.write('\n')  # find out whether it is a number, dot or pound
                continue
            sylls, num_syll, nuc, kind = syll_detector(phns)
            current_word.append(('sylls', sylls))
            current_word.append(('num_sylls', num_syll))
            current_word.append(('nuc', nuc))
            current_word.append(('nuc_kind', kind))
            if i > 0:  # if not the first word
                past_word.append(('right_nuc', nuc))
                past_word.append(('right_nuc_kind', kind))
                json.dump(OrderedDict(past_word), file, indent=4)  # copy past_word to json
                current_word.append(('left_nuc', past_nuc))
                current_word.append(('left_nuc_kind', past_kind))
            # after updating current_word, the current nuc becomes past_nuc
            past_nuc = nuc
            past_kind = kind
            if i == 0:
                current_word.append(('left_nuc', 'None'))
            if i == last:
                current_word.append(('right_nuc', 'None'))
                json.dump(OrderedDict(current_word), file, indent=4)  # copy the final word to json
            past_word = copy.deepcopy(current_word)  # keep the past dict
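# The global `d` used above looks like the CMU Pronouncing Dictionary (an
# assumption based on the d[word.lower()][0] lookups); its phoneme lists are
# what syll_detector consumes. Requires the 'cmudict' NLTK resource:
from nltk.corpus import cmudict
d = cmudict.dict()
print(d['water'][0])  # ['W', 'AO1', 'T', 'ER0'] -- vowels carry stress digits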
def test_reverb_patterns_extraction(sentences):
    for line in fileinput.input(sentences):
        # s = line.split('sentence:')[1].strip()
        text_tokens = word_tokenize(re.sub(r"</?e[1-2]>|\"", "", line))
        tagged = pos_tag(text_tokens)
        # convert the tags to the reduced tagset (Petrov et al. 2012)
        # http://arxiv.org/pdf/1104.2086.pdf
        tags = []
        for t in tagged:
            tag = map_tag('en-ptb', 'universal', t[1])
            tags.append((t[0], tag))
        # r = Relationship(None, s, None, None, None)
        # extractRelationalWords(r)
        print(tags)
def pos_tag(tokens, tagset=None):
    """
    Use NLTK's currently recommended part of speech tagger to
    tag the given list of tokens.

        >>> from nltk.tag import pos_tag  # doctest: +SKIP
        >>> from nltk.tokenize import word_tokenize  # doctest: +SKIP
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."))  # doctest: +SKIP
        [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
         ("n't", 'RB'), ('all', 'DT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]

    :param tokens: Sequence of tokens to be tagged
    :type tokens: list(str)
    :return: The tagged tokens
    :rtype: list(tuple(str, str))
    """
    tagger = load(_POS_TAGGER)
    if tagset:
        return [(token, map_tag('en-ptb', tagset, tag))
                for (token, tag) in tagger.tag(tokens)]
    return tagger.tag(tokens)
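# Passing tagset='universal' collapses the PTB tags. Expected output for the
# docstring example (hedged, since it depends on the loaded tagger):
#
#     >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal')  # doctest: +SKIP
#     [('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ('idea', 'NOUN'), ('is', 'VERB'),
#      ("n't", 'ADV'), ('all', 'DET'), ('that', 'DET'), ('bad', 'ADJ'), ('.', '.')]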
def load_web_eng(filename="", trans=False):
    lines = list(open(filename, "r").readlines())
    lines = [l.strip() for l in lines]
    doc = []
    tags = []
    sent_w = []
    sent_t = []
    for l in lines:
        if l == '':
            doc.append(sent_w)
            tags.append(sent_t)
            sent_w = []
            sent_t = []
        else:
            w, t = l.split('\t')
            if t != "-NONE-":
                sent_w.append(w.lower())
                if trans:
                    sent_t.append(map_tag('en-ptb', 'universal', t))
                else:
                    sent_t.append(t)
    return doc, tags
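# Hedged usage sketch: load_web_eng expects one "word<TAB>tag" pair per line,
# with a blank line terminating each sentence. The file name is made up:
sample = "The\tDT\ndog\tNN\nbarked\tVBD\n\nIt\tPRP\nran\tVBD\n\n"
with open("web_eng_sample.txt", "w") as out:
    out.write(sample)
docs, tags = load_web_eng("web_eng_sample.txt", trans=True)
print(docs[0], tags[0])  # ['the', 'dog', 'barked'] ['DET', 'NOUN', 'VERB']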
def morph(self, source, reference, constrain_pos=True):
    orig_tokenized = MosesTokenizer(lang='en').tokenize(source)
    pos_tagged = [(token, map_tag("en-ptb", 'universal', tag))
                  for (token, tag) in self.tagger.tag(orig_tokenized)]
    pos_tagged = [(tagged[0], '.') if '&' in tagged[0] else tagged
                  for tagged in pos_tagged]
    token_inflections = self.get_inflections(orig_tokenized, pos_tagged, constrain_pos)
    original_score, orig_predicted = self.get_score(source, reference)
    forward_perturbed, forward_score, \
        forward_predicted, num_queries_forward = self.search_seq2seq(
            token_inflections, orig_tokenized, source, original_score, reference)
    if forward_score == original_score:
        forward_predicted = orig_predicted
    if forward_score == 0:
        return (MosesDetokenizer(lang='en').detokenize(forward_perturbed),
                forward_predicted, num_queries_forward + 1)
    backward_perturbed, backward_score, \
        backward_predicted, num_queries_backward = self.search_seq2seq(
            token_inflections, orig_tokenized, source, original_score,
            reference, backward=True)
    if backward_score == original_score:
        backward_predicted = orig_predicted
    num_queries = 1 + num_queries_forward + num_queries_backward
    if forward_score < backward_score:
        return (MosesDetokenizer(lang='en').detokenize(forward_perturbed),
                forward_predicted, num_queries)
    else:
        return (MosesDetokenizer(lang='en').detokenize(backward_perturbed),
                backward_predicted, num_queries)
def featurize(indir, pat, jsons):
    """
    Construct a json file per sentence s.t. every word is described by:
    its tag, its collapsed tag (according to NLTK 3.0), a function word
    (bool), negation (bool), vowels, number of syllables, and the current,
    past and future words in terms of the nucleus and level of prominence.
    """
    outdir = pat + '/' + indir.rsplit("/", 1)[-1]
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    past_nuc = ''
    past_kind = ''
    number_pattern = re.compile(r"(\d+)?[.,-]?(\\/)?\d+(th)?('s)?['s]?(\w+)?")
    hiphend_pattern_num_word_mix = re.compile(r"(\w+(.\w+)?(\w+)?)-\w+")
    past_word = []
    last = len(jsons)
    first_dumped = 0  # since the first word is not necessarily dumped
    for (i, terminal) in enumerate(dict_the_json(indir, jsons)):
        word = str(terminal["word"].lower())
        subword = str(terminal["subword"].lower())
        if word != subword:
            # print(word, subword)
            pass
        try:  # if "repair" or missing, keep going
            if terminal["disf_stat"] == "reparandum":
                continue
        except KeyError:
            pass
        current_word = []
        # check for contracted verb suffixes (excluding "'s")
        if past_word and (word in ["'d", "'ll", "'n'", "'re", "'ve", "n't", "'m"]
                          or (past_word[0][1] + word == subword and
                              subword in ["he's", "she's", "it's", "that's", "what's"])
                          or (past_word[0][1] + word == subword and word != "'s")):
            past_word[0][1] = past_word[0][1] + word
            try:
                phns = d[past_word[0][1].lower()][0]
            except KeyError:
                # print(past_word[0][1].lower())
                phn1 = d[past_word[0][1][:-len(word)]][0]
                phn2 = d[str(word)][0]
                phns = phn1 + phn2
            sylls, num_syll, nuc, kind = syll_detector(phns)
            for k in range(7):  # max sylls
                try:
                    current_word.append((k, sylls[k]))
                except IndexError:
                    current_word.append((k, "NA"))
            # past_word[5][1] = sylls
            # past_word[6][1] = num_syll
            # past_word[7][1] = nuc
            # past_word[8][1] = kind
            # past_word[12][1] = num_syll
            past_word[12][1] = nuc
            past_word[13][1] = kind
            continue
        current_word.append(['word', word])
        current_word.append(('tag', terminal["tag"]))
        current_word.append(
            ('collps_tag', map_tag(Source, target='universal',
                                   source_tag=terminal["tag"])))
        current_word.append(('function', bool(word in function)))
        current_word.append(('negation', bool(word in negation)))
        try:
            phns = d[word][0]
        except KeyError:
            # hyphen separated
            if re.findall(hiphend_pattern_num_word_mix, word):
                phns = []
                for w in word.split('-'):
                    try:
                        phn = d[w.lower()][0]
                        phns.extend(phn)
                    except KeyError:
                        # and there's a number
                        if re.findall(number_pattern, word):
                            w = 'five'
                            phn = d[w.lower()][0]
                            phns.extend(phn)
            else:
                if word == ",":
                    print('h')
                # print(word)
                f.write(word)
                f.write('\n')  # find out whether it is a number, dot or pound
                word = 'name'  # unrecognized word or pattern
                current_word[0][1] = word
                phns = d[word.lower()][0]
        sylls, num_syll, nuc, kind = syll_detector(phns)
        for k in range(7):  # max sylls
            try:
                current_word.append((k, sylls[k]))
            except IndexError:
                current_word.append((k, "NA"))
        current_word.append(['nuc', nuc])
        current_word.append(['nuc_kind', kind])
        try:  # dialact
            current_word.append(("dialAct", terminal["dialAct:niteType"]))
            current_word.append(("seq", terminal["dialAct:id"]))
        except KeyError:
            current_word.append(("dialAct", "no"))
        try:  # kontrast
            current_word.append(("kontrast level", terminal["kontrast:level"]))
            current_word.append(("kontrast type", terminal["kontrast:type"]))
        except KeyError:
            current_word.append(("kontrast level", "no"))
            current_word.append(("kontrast type", "no"))
        try:  # phrases
            current_word.append(('phrases', terminal["phrases:type"]))
        except KeyError:
            current_word.append(('phrases', "no"))
        try:  # accents
            current_word.append(('accents_strength', terminal["accents:strength"]))
        except KeyError:
            pass
        if first_dumped and i != last:  # if not the first dumped word
            past_word.append(('right_nuc', nuc))
            past_word.append(('right_nuc_kind', kind))
            # copy past_word to json
            with open(outdir + '/' + past_id, 'w') as fjson:
                json.dump(OrderedDict(past_word), fjson, indent=4)
            current_word.append(('left_nuc', past_nuc))
            current_word.append(('left_nuc_kind', past_kind))
        # after updating current_word, the current nuc becomes past_nuc
        past_nuc = nuc
        past_kind = kind
        if not first_dumped:
            current_word.append(('left_nuc', 'None'))
            first_dumped = 1
        if i == last:
            current_word.append(('right_nuc', 'None'))
            # copy the final word to json
            with open(outdir + '/' + past_id, 'w') as fjson:
                json.dump(OrderedDict(current_word), fjson, indent=4)
        past_word = copy.deepcopy(current_word)  # keep the past dict
        past_id = terminal["id"]
def _pos_tag(tokens, tagset, tagger):
    tagged_tokens = tagger.tag(tokens)
    if tagset:
        tagged_tokens = [(token, map_tag('en-ptb', tagset, tag))
                         for (token, tag) in tagged_tokens]
    return tagged_tokens
def morph(self, question_dict, context, constrain_pos=True, conservative=False):
    original = question_dict['question']
    gold_starts = [ans['answer_start'] for ans in question_dict['answers']]
    gold_texts = [ans['text'] for ans in question_dict['answers']]
    gold_ends = [gold_starts[i] + len(text) for i, text in enumerate(gold_texts)]
    question_dict['gold_char_spans'] = list(zip(gold_starts, gold_ends))
    question_dict['gold_texts'] = gold_texts
    orig_tokenized = MosesTokenizer(lang='en').tokenize(original)
    pos_tagged = [(token, map_tag("en-ptb", 'universal', tag))
                  for (token, tag) in self.tagger.tag(orig_tokenized)]
    pos_tagged = [(tagged[0], '.') if '&' in tagged[0] else tagged
                  for tagged in pos_tagged]
    token_inflections = super(MorpheusQA, self).get_inflections(
        orig_tokenized, pos_tagged, constrain_pos)
    original_loss, init_predicted = self.get_loss(original, question_dict, context)
    if self.metric_max(compute_f1, init_predicted, question_dict['gold_texts']) == 0:
        return original, init_predicted, 1
    forward_perturbed, forward_loss, forward_predicted, num_queries_forward = \
        self.search_qa(token_inflections, orig_tokenized, original_loss,
                       question_dict, context, conservative)
    if conservative and self.metric_max(compute_f1, forward_predicted,
                                        question_dict['gold_texts']) == 0:
        return (MosesDetokenizer(lang='en').detokenize(forward_perturbed),
                forward_predicted, num_queries_forward + 1)
    backward_perturbed, backward_loss, backward_predicted, num_queries_backward = \
        self.search_qa(token_inflections, orig_tokenized, original_loss,
                       question_dict, context, conservative, backward=True)
    num_queries = 1 + num_queries_forward + num_queries_backward
    if forward_loss > backward_loss:
        return (MosesDetokenizer(lang='en').detokenize(forward_perturbed),
                forward_predicted, num_queries)
    else:
        return (MosesDetokenizer(lang='en').detokenize(backward_perturbed),
                backward_predicted, num_queries)
def generate_dataset_json(files, out_file, lemma_know=None, relationship_know=None):
    d = {}
    lemmas = set()
    stemmer = PorterStemmer()
    if os.path.isfile(out_file):
        with open(out_file, 'r') as f:
            data = f.read()
            d = json.loads(data)
    if lemma_know is not None:
        d.update(lemma_know)
    if relationship_know is None:
        relationship_know = {}
    for f in files:
        with open(f) as file:
            for line in tqdm.tqdm(file):
                l = line.strip()
                tokens = l.split()
                if len(l) == 0 or tokens[12] != 'Y':
                    continue
                lemma = tokens[2]
                pos = tokens[4]
                if lemma == '%':
                    k = '%25_' + mapping.map_tag('wsj', 'universal', pos)
                else:
                    k = lemma.lower() + '_' + mapping.map_tag('wsj', 'universal', pos)
                lemmas.add(k)
    lemmas = sorted(list(lemmas))
    for l in tqdm.tqdm(lemmas):
        if l not in d:
            lemma, pos = l.rsplit('_', 1)
            print(lemma, pos, 'to add.')
            # print(mapping.map_tag('wsj', 'universal', pos))
            b = utils.getAssociatedSynsetsBabelnet(lemma, pos, BABEL_KEY)
            if b == -1:
                return d
            if len(b) == 0:
                b = utils.getAssociatedSynsetsBabelnet(lemma, pos, BABEL_KEY, pos='POS')
            if len(b) == 0:
                b = utils.getAssociatedSynsetsBabelnet(lemma, pos, BABEL_KEY, wn=False)
            if len(b) == 0:
                b = utils.getAssociatedSynsetsBabelnet(lemma, pos, BABEL_KEY, wn=False, pos='POS')
            if len(b) == 0:
                lemma = stemmer.stem(lemma)
                print('Stemmer used: ', lemma)
                b = utils.getAssociatedSynsetsBabelnet(lemma, pos, BABEL_KEY)
            if len(b) == 0:
                b = utils.getAssociatedSynsetsBabelnet(lemma, pos, BABEL_KEY, pos='POS')
            if len(b) == 0:
                b = utils.getAssociatedSynsetsBabelnet(lemma, pos, BABEL_KEY, wn=False)
            if len(b) == 0:
                b = utils.getAssociatedSynsetsBabelnet(lemma, pos, BABEL_KEY, wn=False, pos='POS')
            print(b)
            # assert(len(b) > 0)
            to_add = {}
            for s in b:
                links = relationship_know.get(
                    s, utils.getSemanticRelatioshipBabelnet(s, BABEL_KEY))
                to_add.update({s: links})
            d.update({l: to_add})
            with open(out_file, 'w') as f:
                f.write(json.dumps(d, ensure_ascii=False))
        else:
            print(l, 'already present')
    with open(out_file, 'w') as f:
        f.write(json.dumps(d, ensure_ascii=False))
    return d
def extract_reverb_patterns(text): """ Extract ReVerb relational patterns http://homes.cs.washington.edu/~afader/bib_pdf/emnlp11.pdf VERB - verbs (all tenses and modes) NOUN - nouns (common and proper) PRON - pronouns ADJ - adjectives ADV - adverbs ADP - adpositions (prepositions and postpositions) CONJ - conjunctions DET - determiners NUM - cardinal numbers PRT - particles or other function words X - other: foreign words, typos, abbreviations . - punctuation # extract ReVerb patterns: # V | V P | V W*P # V = verb particle? adv? # W = (noun | adj | adv | pron | det) # P = (prep | particle | inf. marker) """ # split text into tokens text_tokens = PunktWordTokenizer().tokenize(text) # tag the sentence, using the default NTLK English tagger # POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle' tags_ptb = pos_tag(text_tokens) # convert the tags to reduced tagset (Petrov et al. 2012) # http://arxiv.org/pdf/1104.2086.pdf tags = [] for t in tags_ptb: tag = map_tag('en-ptb', 'universal', t[1]) tags.append((t[0], tag)) patterns = [] patterns_tags = [] i = 0 limit = len(tags)-1 while i <= limit: tmp = StringIO.StringIO() tmp_tags = [] # a ReVerb pattern always starts with a verb if tags[i][1] == 'VERB': tmp.write(tags[i][0]+' ') t = (tags[i][0], tags[i][1]) tmp_tags.append(t) i += 1 # V = verb particle? adv? (also capture auxiliary verbs) while i <= limit and tags[i][1] in ['VERB', 'PRT', 'ADV']: tmp.write(tags[i][0]+' ') t = (tags[i][0], tags[i][1]) tmp_tags.append(t) i += 1 # W = (noun | adj | adv | pron | det) while i <= limit and tags[i][1] in ['NOUN', 'ADJ', 'ADV', 'PRON', 'DET']: tmp.write(tags[i][0]+' ') t = (tags[i][0], tags[i][1]) tmp_tags.append(t) i += 1 # P = (prep | particle | inf. marker) while i <= limit and tags[i][1] in ['ADP', 'PRT']: tmp.write(tags[i][0]+' ') t = (tags[i][0], tags[i][1]) tmp_tags.append(t) i += 1 # add the build pattern to the list collected patterns patterns.append(tmp.getvalue()) patterns_tags.append(tmp_tags) i += 1 return patterns, patterns_tags
\tlabelloc="t"; \tlabel="%s"; \t""" %( '\\n'.join( textwrap.wrap(self.sentence) ) ) output += u'\n\t'.join(map(operator.attrgetter('dot_str'), self.nodes)) output += u'\n\n\t' output += u'\n\t'.join(map(operator.attrgetter('dot_str'), self.edges)) output += u'\n' output += u'};\n' return output map_tag_to_universal = lambda tag: map_tag('en-ptb', 'universal', tag) STANFORD_ATTRIBUTES = {"Text": {"name": "token", "type": unicode}, "PartOfSpeech": {"name": "pos_tag", "type": str, "mapping_func": map_tag_to_universal}, "Lemma": {"name": "lemma", "type": str}} def parse_token_line(l, prepend_root = True): """ Parsing the line containing tokens and POS tags information l: str The token&POS line prepend_root: bool If True, root is automatically prepended to the list >>> tokens = parse_token_line(u"[Text=Schneider CharacterOffsetBegin=0 CharacterOffsetEnd=9 PartOfSpeech=NNP] [Text=Electric CharacterOffsetBegin=10 CharacterOffsetEnd=18 PartOfSpeech=NNP]", prepend_root = True)
import sys
from providedcode.transitionparser import TransitionParser
from providedcode.evaluate import DependencyEvaluator
from providedcode.dependencygraph import DependencyGraph
from nltk.tag import mapping

if len(sys.argv) != 2:
    sys.stderr.write("No model provided.")
    sys.exit(1)

# model: sys.argv[1], e.g. english.model
tp = TransitionParser.load(sys.argv[1])

for sentence in sys.stdin:
    # build a DependencyGraph via its from_sentence constructor
    s = DependencyGraph.from_sentence(sentence)
    for node in s.nodes:
        tag = s.nodes[node]['tag']
        ctag = mapping.map_tag('wsj', 'universal', tag)
        s.nodes[node]['ctag'] = ctag
    x = tp.parse([s])
    print(x[0].to_conll(10).encode('utf-8'))
def extract_reverb_patterns(text): """ Extract ReVerb relational patterns http://homes.cs.washington.edu/~afader/bib_pdf/emnlp11.pdf VERB - verbs (all tenses and modes) NOUN - nouns (common and proper) PRON - pronouns ADJ - adjectives ADV - adverbs ADP - adpositions (prepositions and postpositions) CONJ - conjunctions DET - determiners NUM - cardinal numbers PRT - particles or other function words X - other: foreign words, typos, abbreviations . - punctuation # extract ReVerb patterns: # V | V P | V W*P # V = verb particle? adv? # W = (noun | adj | adv | pron | det) # P = (prep | particle | inf. marker) """ # split text into tokens text_tokens = PunktWordTokenizer().tokenize(text) # tag the sentence, using the default NTLK English tagger # POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle' tags_ptb = pos_tag(text_tokens) # convert the tags to reduced tagset (Petrov et al. 2012) # http://arxiv.org/pdf/1104.2086.pdf tags = [] for t in tags_ptb: tag = map_tag('en-ptb', 'universal', t[1]) tags.append((t[0], tag)) patterns = [] patterns_tags = [] i = 0 limit = len(tags) - 1 while i <= limit: tmp = StringIO.StringIO() tmp_tags = [] # a ReVerb pattern always starts with a verb if tags[i][1] == 'VERB': tmp.write(tags[i][0] + ' ') t = (tags[i][0], tags[i][1]) tmp_tags.append(t) i += 1 # V = verb particle? adv? (also capture auxiliary verbs) while i <= limit and tags[i][1] in ['VERB', 'PRT', 'ADV']: tmp.write(tags[i][0] + ' ') t = (tags[i][0], tags[i][1]) tmp_tags.append(t) i += 1 # W = (noun | adj | adv | pron | det) while i <= limit and tags[i][1] in [ 'NOUN', 'ADJ', 'ADV', 'PRON', 'DET' ]: tmp.write(tags[i][0] + ' ') t = (tags[i][0], tags[i][1]) tmp_tags.append(t) i += 1 # P = (prep | particle | inf. marker) while i <= limit and tags[i][1] in ['ADP', 'PRT']: tmp.write(tags[i][0] + ' ') t = (tags[i][0], tags[i][1]) tmp_tags.append(t) i += 1 # add the build pattern to the list collected patterns patterns.append(tmp.getvalue()) patterns_tags.append(tmp_tags) i += 1 return patterns, patterns_tags
Tagging part of speech

    Use the maxent treebank pos tagging model in NLTK by default,
    each item consisting of a list of tokens.
    """
print(string.punctuation)
with open("res/sentences_train.csv", "r", newline='') as f:
    csvreader = csv.reader(f)
    for row in csvreader:
        # remove punctuation and ignore non-ascii characters
        # (Python 3 port of the original string.maketrans/decode idiom)
        sentence = (row[0].encode('ascii', 'ignore').decode('ascii')
                    .translate(str.maketrans('', '', punctuation)))
        text = nltk.word_tokenize(sentence)
        original_tag = nltk.pos_tag(text)
        # map the original PTB tags to universal tags
        simplified_tag = [(word, map_tag('en-ptb', 'universal', tag))
                          for word, tag in original_tag]
        # manually add two boundary tags
        simplified_tag = [(u'START', u'START')] + simplified_tag + [(u'END', u'END')]
        # tokens.append(simplified_tag)
        tokens.extend(simplified_tag)

print(tokens[0:10])

# TODO: benchmark
# use word association to generate suggestions
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(word for word, tag in tokens)
scored = finder.score_ngrams(bigram_measures.raw_freq)
print(sorted(bigram for bigram, score in scored))
finder.apply_freq_filter(2)
scored = finder.score_ngrams(bigram_measures.raw_freq)