def __init__(self, _sentence):
    # Presumably a method of a Sentence class: find all tagged entities in
    # the sentence and build a Relationship for every adjacent entity pair.
    self.relationships = set()
    self.sentence = _sentence
    matches = list(re.finditer(regex, self.sentence))
    if len(matches) >= 2:
        for x in range(0, len(matches) - 1):
            # the left context starts where the previous entity ended
            start = 0 if x == 0 else matches[x - 1].end()
            # the right context ends where the entity after the pair starts
            try:
                end = matches[x + 2].start()
            except IndexError:
                end = len(self.sentence) - 1
            before = self.sentence[start:matches[x].start()]
            between = self.sentence[matches[x].end():matches[x + 1].start()]
            after = self.sentence[matches[x + 1].end():end]

            # select only a few tokens from the left and right contexts
            # TODO: read the context values from parameters.cfg
            before = PunktWordTokenizer().tokenize(before)[-2:]
            after = PunktWordTokenizer().tokenize(after)[:2]
            before = ' '.join(before)
            after = ' '.join(after)

            # only consider relationships where the distance between the two
            # entities is less than 8 tokens
            # TODO: read the context window size from parameters.cfg
            if not len(PunktWordTokenizer().tokenize(between)) > 8:
                ent1 = matches[x].group()
                ent2 = matches[x + 1].group()
                arg1match = re.match("<[A-Z]+>", ent1)
                arg2match = re.match("<[A-Z]+>", ent2)
                # strip the opening and closing entity tags
                ent1 = re.sub("</?[A-Z]+>", "", ent1, count=2, flags=0)
                ent2 = re.sub("</?[A-Z]+>", "", ent2, count=2, flags=0)
                arg1type = arg1match.group()[1:-1]
                arg2type = arg2match.group()[1:-1]
                rel = Relationship(_sentence, before, between, after,
                                   ent1, ent2, arg1type, arg2type,
                                   _type=None, _id=None)
                self.relationships.add(rel)
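# The module-level `regex` used above is not shown in this snippet. A minimal
# sketch of what it presumably matches -- entities wrapped in uppercase
# XML-style tags such as <PER>...</PER> -- could look like this (the exact
# pattern and the sample sentence are assumptions, not the original code):

import re

regex = re.compile(r'<[A-Z]+>[^<]+</[A-Z]+>')

sentence = "<PER>Steve Jobs</PER> founded <ORG>Apple</ORG> in 1976 ."
for m in re.finditer(regex, sentence):
    print m.group()   # -> '<PER>Steve Jobs</PER>', then '<ORG>Apple</ORG>'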
def extract_patterns(self, config):
    """Extract ReVerb patterns and construct Word2Vec representations."""
    # http://www.ling.upenn.edu/courses/Fall_2007/ling001/penn_treebank_pos.html
    # select everything except stopwords, adjectives and adverbs
    filter_pos = ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS', 'WRB']
    patterns_bet, patterns_bet_tags = Reverb.extract_reverb_patterns(self.bet)
    if len(patterns_bet) > 0:
        self.patterns_words = patterns_bet
        pattern = [t[0] for t in patterns_bet_tags[0]
                   if t[0].lower() not in config.stopwords
                   and t[1] not in filter_pos]
        if len(pattern) >= 1:
            pattern_vector_bet = Word2VecWrapper.pattern2vector(pattern, config)
            self.patterns_vectors.append(pattern_vector_bet)
    else:
        # if no ReVerb patterns are found, extract words from the context
        # split text into tokens
        text_tokens = PunktWordTokenizer().tokenize(self.bet)
        # tag the sentence, using the default NLTK English tagger
        # POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
        tags_ptb = pos_tag(text_tokens)
        pattern = [t[0] for t in tags_ptb
                   if t[0].lower() not in config.stopwords
                   and t[1] not in filter_pos]
        if len(pattern) >= 1:
            pattern_vector_bet = Word2VecWrapper.pattern2vector(pattern, config)
            self.patterns_vectors.append(pattern_vector_bet)
            self.patterns_words = pattern
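# Word2VecWrapper.pattern2vector is not defined in these snippets. A common
# implementation, and a plausible reading of how it is used here, is to sum
# the word2vec embeddings of the pattern tokens; the sketch below is an
# assumption (including config.vec_dim and config.word2vec), not the
# original code:

from numpy import zeros

def pattern2vector(tokens, config):
    """Sum the word2vec vectors of the tokens; unknown words are skipped."""
    vector = zeros(config.vec_dim)
    for token in tokens:
        try:
            vector += config.word2vec[token.lower()]
        except KeyError:
            pass  # out-of-vocabulary word
    return vector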
def sloTag2(uText):
    # `sent_tokenizer` and `tagger` are module-level objects assumed to be
    # initialised elsewhere (see the setup sketch after sloTag below)
    sents = sent_tokenizer.tokenize(uText)
    tokens = []
    for s in sents:
        # split off a trailing full stop so it is tagged as its own token
        if s and s[-1] == '.':
            s = s[:-1] + " ."
        t = PunktWordTokenizer().tokenize(s)
        tokens += t
    result = tagger.tag(tokens)
    return result
def extract_knowledge(title, text_study):
    import operator
    from nltk import PunktWordTokenizer
    from nltk.corpus import stopwords

    stop = set(stopwords.words('english'))
    pwt = PunktWordTokenizer()
    tokens = pwt.tokenize(text_study)
    tokens = [i for i in tokens if i.lower() not in stop]
    # build a case-insensitive frequency histogram, skipping single characters
    hist = {}
    for token in tokens:
        if len(token) == 1:
            continue
        hist[token.lower()] = hist.get(token.lower(), 0) + 1
    # now that word freqs are counted, find the n-best
    sorted_hist = sorted(hist.items(), key=operator.itemgetter(1), reverse=True)
    for i in sorted_hist:
        print i
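# Minimal usage sketch for extract_knowledge; the file name is hypothetical.
# Note that `title` is currently unused; (word, count) pairs are printed in
# descending frequency order:

text = open('study.txt').read()
extract_knowledge('My Study', text)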
def preprocess(file_contents, add_sent_markers=True):
    """
    :param file_contents: contents of the file that needs to be preprocessed
    :param add_sent_markers: flag to enable addition of sentence start and
        end markers. True by default.
    :return: list of tokenized words
    :rtype: list
    """
    raw = clean_html(file_contents)
    # strip timestamps, rating fields and the CSV header from the raw text
    raw = re.sub(r'\d+:\d+|\d+,\d+,|IsTruthFul,IsPositive,review', "", raw)
    sentence_list = tokenize.sent_tokenize(raw)
    if add_sent_markers:
        sentence_list = [('<s> ' + sentence + ' </s>') for sentence in sentence_list]
    word_lists = [PunktWordTokenizer().tokenize(sentence) for sentence in sentence_list]
    # flatten the per-sentence token lists into a single list of words
    word_list = [item for sublist in word_lists for item in sublist]
    return word_list
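# A short usage sketch for preprocess. clean_html is the old nltk.clean_html
# helper (removed in newer NLTK releases), so the input may carry HTML markup;
# the sample string below is hypothetical:

html = "<html><body>The room was clean. Staff were friendly.</body></html>"
words = preprocess(html)
# -> a flat token list, with each sentence wrapped in '<s>' ... '</s>' markers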
def similarity_all(t, extraction_pattern, config):
    """
    Cosine similarity between all patterns that are part of a
    Cluster/Extraction Pattern and the vector of a ReVerb pattern
    extracted from a sentence.
    """
    good = 0
    bad = 0
    max_similarity = 0
    for p in list(extraction_pattern.patterns_words):
        tokens = PunktWordTokenizer().tokenize(p)
        vector = Word2VecWrapper.pattern2vector(tokens, config)
        # cosine similarity: dot product of the two unit-normalised vectors
        score = dot(matutils.unitvec(t.patterns_vectors[0]),
                    matutils.unitvec(vector))
        if score > max_similarity:
            max_similarity = score
        if score >= config.threshold_similarity:
            good += 1
        else:
            bad += 1
    if good >= bad:
        return True, max_similarity
    else:
        return False, 0.0
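# The score above is plain cosine similarity: gensim's matutils.unitvec
# scales each vector to unit length, so their dot product equals cos(theta).
# A self-contained check with numpy (the values are illustrative):

from numpy import dot, array
from gensim import matutils

a = array([1.0, 2.0, 3.0])
b = array([2.0, 4.0, 6.0])
print dot(matutils.unitvec(a), matutils.unitvec(b))   # ~1.0: parallel vectors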
def tokenize(self, text):
    return [word for word in PunktWordTokenizer().tokenize(text.lower())
            if word not in self.config.stopwords]
def sloTag(uText):
    # tag the whole text as one token stream; `tagger` is a module-level
    # POS tagger assumed to be initialised elsewhere
    tokens = PunktWordTokenizer().tokenize(uText)
    result = tagger.tag(tokens)
    return result
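# sloTag and sloTag2 rely on module-level `tagger` and `sent_tokenizer`
# objects that are not defined in these snippets. A hypothetical setup --
# the training data, tagset and pickle path below are assumptions, not the
# original configuration:

import pickle
from nltk.tag import UnigramTagger

# toy training data; real usage would load a large tagged (e.g. Slovene) corpus
train = [[(u'Danes', 'ADV'), (u'je', 'VERB'), (u'lep', 'ADJ'),
          (u'dan', 'NOUN'), (u'.', 'PUNC')]]
tagger = UnigramTagger(train)

# Punkt sentence model, e.g. pickled and shipped with the project
sent_tokenizer = pickle.load(open('slovene_punkt.pickle', 'rb'))

print sloTag(u"Danes je lep dan .")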