Example #1
import pickle

import gensim
from nltk.parse import CoreNLPParser, CoreNLPDependencyParser
from sklearn.feature_extraction import DictVectorizer


class FeatureExtractor:
    def __init__(self, w2v_path, corpus_dict_path, port=9000):
        # CoreNLP clients (a CoreNLP server must be running on the given port)
        self.parser = CoreNLPParser(url='http://localhost:' + str(port))
        self.dep_parser = CoreNLPDependencyParser(url='http://localhost:' +
                                                  str(port))
        # word2vec embeddings loaded from the path given to the constructor
        self.word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
            w2v_path, binary=True)
        print('w2v model loaded')
        # training corpus for one-hot features
        with open(corpus_dict_path, 'rb') as f:
            corpus_dict = pickle.load(f)

        self.dep_tuple_vectorizer = DictVectorizer(sparse=False)
        self.dep_tuple_vectorizer = self.dep_tuple_vectorizer.fit(
            corpus_dict['dep_tuple'])

        self.unigram_vectorizer = DictVectorizer(sparse=False)
        self.unigram_vectorizer = self.unigram_vectorizer.fit(
            corpus_dict['unigram'])

        self.bigram_vectorizer = DictVectorizer(sparse=False)
        self.bigram_vectorizer = self.bigram_vectorizer.fit(
            corpus_dict['bigram'])

        self.trigram_vectorizer = DictVectorizer(sparse=False)
        self.trigram_vectorizer = self.trigram_vectorizer.fit(
            corpus_dict['trigram'])

        self.lexical_vectorizer = DictVectorizer(sparse=False)
        self.lexical_vectorizer = self.lexical_vectorizer.fit(
            corpus_dict['lexical'])
def depParser(sent):
    """Return the dependency triples for one sentence (requires a running CoreNLP server)."""
    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
    result = dep_parser.raw_parse(sent)
    graph = next(result)          # raw_parse yields DependencyGraph objects
    dep = list(graph.triples())   # ((governor, gov_tag), relation, (dependent, dep_tag))
    return dep
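
# A minimal usage sketch for depParser, assuming a CoreNLP server is already running on
# localhost:9000; the sentence is only illustrative.
if __name__ == '__main__':
    # each triple has the form ((governor_word, governor_tag), relation, (dependent_word, dependent_tag))
    for gov, rel, dep in depParser('The quick brown fox jumps over the lazy dog.'):
        print(gov, rel, dep)
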
Example #3
def count_dep_differences(graph1, graph2):
    # NOTE: the function header and the first counting loop were truncated in this
    # listing; they are reconstructed here by symmetry (the function name is assumed).
    # graph1 and graph2 are lists of dependency triples; i[1] is the relation label.
    counts1 = {}
    counts2 = {}
    for i in graph1:
        counts1[i[1]] = counts1.get(i[1], 0) + 1
    for i in graph2:
        counts2[i[1]] = counts2.get(i[1], 0) + 1

    all_deps = set(list(counts1.keys()) + list(counts2.keys()))
    diffs = 0
    for dep in all_deps:
        diffs += abs(counts1.get(dep, 0) - counts2.get(dep, 0))
    return diffs
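
# A minimal sketch of using count_dep_differences (name assumed from the reconstruction
# above); it requires a CoreNLP server on localhost:9000 and assumes
# CoreNLPDependencyParser is imported as in the main code below. Sentences are illustrative.
_sketch_parser = CoreNLPDependencyParser(url='http://localhost:9000')
graph_a = list(next(_sketch_parser.raw_parse('She gave him the book.')).triples())
graph_b = list(next(_sketch_parser.raw_parse('He was given the book by her.')).triples())
print(count_dep_differences(graph_a, graph_b))  # summed absolute difference of relation counts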


########################################################################################################
# Main code
########################################################################################################

# initialize the dependency parser
chi_parser = CoreNLPDependencyParser('http://localhost:9001')

# use nltk treebank tokenizer and detokenizer
tokenizer = TreebankWordTokenizer()
detokenizer = TreebankWordDetokenizer()

# BERT initialization
berttokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
bertmodel = BertForMaskedLM.from_pretrained('bert-large-uncased')
bertmodel.eval()

# initialize the Google translate client
translate_client = translate.Client()

print('initialized')
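
# A minimal sketch of querying the masked-LM components initialized above, assuming the
# Hugging Face transformers-style API in which the model output's first element is the
# prediction logits; the sentence and mask position are illustrative.
import torch

masked_text = '[CLS] the cat sat on the [MASK] . [SEP]'
masked_tokens = berttokenizer.tokenize(masked_text)
masked_ids = torch.tensor([berttokenizer.convert_tokens_to_ids(masked_tokens)])
mask_position = masked_tokens.index('[MASK]')
with torch.no_grad():
    prediction_logits = bertmodel(masked_ids)[0]          # (1, seq_len, vocab_size)
best_id = int(prediction_logits[0, mask_position].argmax())
print(berttokenizer.convert_ids_to_tokens([best_id])[0])  # most likely filler token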
# Setting up the geography category
conn = part1.create_connection(os.path.join("Database", "WorldGeography.sqlite"))
if conn is not None:
    geog_db = part1.geography_db(conn)

# Getting the set of similar words from WordNet
geog_set = part1.create_lists(geog)
mov_set = part1.create_lists(movies_list)
music_set = part1.create_lists(music_list)

# Getting the tags
qstn, ner, pos = part1.tagging(filename)
parser = CoreNLPParser(url='http://localhost:9000')
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

# Printing the parse tree and output
for i in range(len(qstn)):
    qtype = "YesNo"
    print("Question: ", qstn[i].strip())
    parsed = part1.parse_tree(qstn[i], parser)
    dep_parsed = return_deptree(qstn[i], dep_parser)
    # print(list(dep_parsed.triples()), parsed, ner[i])
    # continue
    if 'SBAR' in parsed[0:15]:
        qtype = "WH"
    category = part1.categorize(qstn[i], ner[i], pos[i], geog_set, mov_set,
                                music_set, mus_name, songs, geog_db)
    print(category)

    if category == "Music":

Example #5
lap_14_train_txt = os.path.join(config['lap_14'], 'train.tsv')
lap_14_test_txt = os.path.join(config['lap_14'], 'test.tsv')

res_14_train_txt = os.path.join(config['res_14'], 'train.tsv')
res_14_test_txt = os.path.join(config['res_14'], 'test.tsv')

res_15_train_txt = os.path.join(config['res_15'], 'train.tsv')
res_15_test_txt = os.path.join(config['res_15'], 'test.tsv')

res_16_train_txt = os.path.join(config['res_16'], 'train.tsv')
res_16_test_txt = os.path.join(config['res_16'], 'test.tsv')

POLARITY_DICT = {'NEU': 0, 'POS': 1, 'NEG': 2}
POLARITY_DICT_REV = {v: k for k, v in POLARITY_DICT.items()}

depparser = CoreNLPDependencyParser(url='http://172.28.6.42:9000')


def load_data(txt_path, pair_path):
    """

    :param txt_path: the original annotation file path
    :param pair_path: the processed pair file path
    :return:
    """
    pairs = read_pickle(pair_path)
    data_list = []
    with open(txt_path, encoding='utf-8') as f:
        texts = f.readlines()
    assert len(pairs) == len(texts)
    for idx, (t, p) in enumerate(zip(texts, pairs)):
Example #6
import pickle
from collections import defaultdict

import gensim
import numpy as np
from nltk.parse import CoreNLPParser, CoreNLPDependencyParser
from sklearn.feature_extraction import DictVectorizer
from textblob import TextBlob

# NER_TAGSET, POS_TAGSET, FP_PRO_LIST, TP_PRO_LIST and StanfordAnnotations are
# defined elsewhere in the source module and are referenced but not shown here.


class FeatureExtractor:
    def __init__(self, w2v_path, corpus_dict_path, port=9000):
        # CoreNLP clients (a CoreNLP server must be running on the given port)
        self.parser = CoreNLPParser(url='http://localhost:' + str(port))
        self.dep_parser = CoreNLPDependencyParser(url='http://localhost:' +
                                                  str(port))
        # word2vec embeddings loaded from the path given to the constructor
        self.word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
            w2v_path, binary=True)
        print('w2v model loaded')
        # training corpus for one-hot features
        with open(corpus_dict_path, 'rb') as f:
            corpus_dict = pickle.load(f)

        self.dep_tuple_vectorizer = DictVectorizer(sparse=False)
        self.dep_tuple_vectorizer = self.dep_tuple_vectorizer.fit(
            corpus_dict['dep_tuple'])

        self.unigram_vectorizer = DictVectorizer(sparse=False)
        self.unigram_vectorizer = self.unigram_vectorizer.fit(
            corpus_dict['unigram'])

        self.bigram_vectorizer = DictVectorizer(sparse=False)
        self.bigram_vectorizer = self.bigram_vectorizer.fit(
            corpus_dict['bigram'])

        self.trigram_vectorizer = DictVectorizer(sparse=False)
        self.trigram_vectorizer = self.trigram_vectorizer.fit(
            corpus_dict['trigram'])

        self.lexical_vectorizer = DictVectorizer(sparse=False)
        self.lexical_vectorizer = self.lexical_vectorizer.fit(
            corpus_dict['lexical'])

    def _get_case_features(self, sent_annotations, sentence):
        num_all_caps = 0
        for word_annotations in sent_annotations:
            if word_annotations.token.isupper():
                num_all_caps += 1
        if sentence.islower():
            is_sent_lower = 1
        else:
            is_sent_lower = 0
        if sent_annotations[0].token.isupper():
            is_first_word_caps = 1
        else:
            is_first_word_caps = 0
        return [num_all_caps, is_sent_lower, is_first_word_caps]

    def _get_dependency_tuples(self, sent_annotations):
        # binary features for (gov, typ, dep), (gov, typ), (typ, dep), (gov, dep),
        # where gov/dep are POS tags and typ is the dependency relation
        dependency_tuple_dict = defaultdict(int)
        for word_annotations in sent_annotations:
            # head is a 1-based index into the sentence (0 denotes the root)
            gov = sent_annotations[int(word_annotations.head) - 1].pos
            typ = word_annotations.depRel
            dep = word_annotations.pos
            gov_typ_dep = '_'.join([gov, typ, dep])
            dependency_tuple_dict[gov_typ_dep] = 1
            gov_typ = '_'.join([gov, typ])
            dependency_tuple_dict[gov_typ] = 1
            typ_dep = '_'.join([typ, dep])
            dependency_tuple_dict[typ_dep] = 1
            gov_dep = '_'.join([gov, dep])
            dependency_tuple_dict[gov_dep] = 1
        return dependency_tuple_dict

    def _get_entity_features(self, sent_annotations):
        ner_tags = [0] * len(NER_TAGSET)
        person_mentions_total_len = 0
        for word_annotations in sent_annotations:
            if word_annotations.ner == 'O':
                continue
            if word_annotations.ner not in NER_TAGSET:
                continue
            else:
                index = NER_TAGSET.index(word_annotations.ner)
                ner_tags[index] = 1
            if word_annotations.ner == 'PERSON':
                person_mentions_total_len += len(word_annotations.token)
        person_mentions_avg_len = person_mentions_total_len * 1.0 / len(
            sent_annotations)
        return ner_tags + [person_mentions_avg_len]

    def _get_lexical_features(self, words):
        num_contractions = 0
        total_word_len = 0
        for word in words:
            if '\'' in word:
                num_contractions += 1
            total_word_len += len(word)
        avg_num_contractions = num_contractions * 1.0 / len(words)
        avg_word_len = total_word_len * 1.0 / len(words)
        #TODO: avg word-log frequency acc to Google Ngram
        #TODO: avg formality score using Pavlick & Nenkova (2015)
        return [avg_num_contractions, avg_word_len]

    def _get_ngrams(self, sent_annotations):
        # tokens = [w.token for w in sent_annotations]
        tokens = [w.lemma for w in sent_annotations]
        sentence = ' '.join(tokens)
        # .decode('utf-8', 'ignore')
        blob = TextBlob(sentence)
        unigrams = tokens
        bigrams = blob.ngrams(n=2)
        trigrams = blob.ngrams(n=3)
        unigram_dict = defaultdict(int)
        bigram_dict = defaultdict(int)
        trigram_dict = defaultdict(int)
        for unigram in unigrams:
            unigram_dict[unigram] = 1
        for bigram in bigrams:
            bigram_dict['_'.join(bigram)] = 1
        for trigram in trigrams:
            trigram_dict['_'.join(trigram)] = 1
        return unigram_dict, bigram_dict, trigram_dict

    def _get_parse_features(self, stanford_parse_tree, sent_annotations):
        sent_len = len(sent_annotations)
        avg_depth = stanford_parse_tree.height() * 1.0 / sent_len
        lexical_production_dict = defaultdict(int)
        for production in stanford_parse_tree.productions():
            # lexical productions are skipped; only grammar productions are counted
            if production.is_lexical():
                continue
            lexical_production_dict[production] += 1
        avg_depth_feature = [avg_depth]
        return avg_depth_feature, lexical_production_dict

    def _get_POS_features(self, sent_annotations):
        pos_tag_ct = [0] * len(POS_TAGSET)
        for word_annotations in sent_annotations:
            try:
                pos_tag_ct[POS_TAGSET.index(word_annotations.pos)] += 1
            except ValueError:
                # POS tag not in POS_TAGSET; skip it
                continue
        for i in range(len(pos_tag_ct)):
            pos_tag_ct[i] = pos_tag_ct[i] * 1.0 / len(sent_annotations)
        return pos_tag_ct

    def _get_punctuation_features(self, sentence):
        num_question_marks = sentence.count('?')
        num_ellipses = sentence.count('...')
        num_exclamations = sentence.count('!')
        return [num_question_marks, num_ellipses, num_exclamations]

    def _get_readability_features(self, sentence, words):
        num_words = len(words)
        num_chars = len(sentence) - sentence.count(' ')
        return [num_words, num_chars]

    def _get_subjectivity_features(self, sent_annotations, sentence):
        subjectivity_features = []
        fp_pros = 0
        tp_pros = 0
        for word_annotations in sent_annotations:
            if word_annotations.lemma in FP_PRO_LIST:
                fp_pros += 1
            if word_annotations.lemma in TP_PRO_LIST:
                tp_pros += 1
        subjectivity_features.append(fp_pros * 1.0 / len(sent_annotations))
        subjectivity_features.append(tp_pros * 1.0 / len(sent_annotations))
        polarity, subjectivity = TextBlob(sentence).sentiment
        subjectivity_features.append(float(np.sign(polarity)))
        subjectivity_features.append(subjectivity)
        return subjectivity_features

    def _get_word2vec_features(self, sent_annotations):
        word_vectors = []
        for word_annotations in sent_annotations:
            try:
                word_vector = self.word2vec_model[word_annotations.lemma]
                word_vectors.append(word_vector)
            except KeyError:
                # lemma not in the word2vec vocabulary; skip it
                continue
        if len(word_vectors) == 0:
            avg_word_vectors = np.zeros(300)
        else:
            avg_word_vectors = np.transpose(np.mean(word_vectors, axis=0))
        return avg_word_vectors

    def _remove_less_frequent(self, counts, reference_dict, freq_cutoff):
        # keep only items whose count in the reference corpus exceeds the cutoff
        new_dict = defaultdict(int)
        for item, count in counts.items():
            if reference_dict[item] > freq_cutoff:
                new_dict[item] = count
        return new_dict

    def extract_features_pt16(self, sentence, sent_annotations, parse_tree):
        words = sentence.split()
        feature_set = []
        #case features
        case_features = self._get_case_features(sent_annotations, sentence)
        feature_set += case_features

        # dependency features
        dependency_tuple_dict = self._get_dependency_tuples(sent_annotations)

        # entity features
        entity_features = self._get_entity_features(sent_annotations)
        feature_set += entity_features

        # lexical features
        lexical_features = self._get_lexical_features(words)
        feature_set += lexical_features

        # ngram features
        unigram_dict, bigram_dict, trigram_dict = self._get_ngrams(
            sent_annotations)

        # parse features
        avg_depth_feature, lexical_production_dict = self._get_parse_features(
            parse_tree, sent_annotations)
        feature_set += avg_depth_feature

        # POS features
        pos_features = self._get_POS_features(sent_annotations)
        feature_set += pos_features

        # punctuation features
        punctuation_features = self._get_punctuation_features(sentence)
        feature_set += punctuation_features

        # readability features
        readability_features = self._get_readability_features(sentence, words)
        feature_set += readability_features

        # subjectivity features
        # subjectivity_features = self._get_subjectivity_features(sent_annotations, sentence)
        # feature_set += subjectivity_features

        # word2vec features
        word2vec_features = self._get_word2vec_features(sent_annotations)
        feature_set = np.concatenate((feature_set, word2vec_features), axis=0)

        # get one hot features
        dependency_tuple_feature = self.dep_tuple_vectorizer.transform(
            dependency_tuple_dict)
        unigram_feature = self.unigram_vectorizer.transform(unigram_dict)
        bigram_feature = self.bigram_vectorizer.transform(bigram_dict)
        trigram_feature = self.trigram_vectorizer.transform(trigram_dict)
        lexical_production_feature = self.lexical_vectorizer.transform(
            lexical_production_dict)

        feature_vectors = np.array([feature_set])
        feature_vectors = np.concatenate(
            (feature_vectors, dependency_tuple_feature, unigram_feature,
             bigram_feature, trigram_feature, lexical_production_feature),
            axis=1)

        return feature_vectors

    def _transform_raw(self, sentence):
        sent_annotations = []

        for dependency in sentence['basicDependencies']:
            dep_idx = dependency['dependent']
            token = sentence['tokens'][dep_idx - 1]

            annotation = StanfordAnnotations(token['word'], token['lemma'],
                                             token['pos'], token['ner'],
                                             dependency['governor'],
                                             dependency['dep'])
            sent_annotations.append(annotation)

        return sent_annotations

    def extract_parse(self, s):
        """
        Easy, built in parser from nltk
        """
        tree_list = self.parser.raw_parse(s, outputFormat='penn')
        tree = next(tree_list)
        return tree

    def extract_annotations(self, s):
        """
        Needs some arm wrestling
        """

        props = {'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,dcoref'}
        raw_json = self.dep_parser.api_call(s, properties=props)
        sentence = raw_json['sentences'][0]
        return self._transform_raw(sentence)
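
# A minimal usage sketch for the FeatureExtractor above, assuming a CoreNLP server on
# port 9000; the corpus-dict path and the sentence are illustrative placeholders.
if __name__ == '__main__':
    extractor = FeatureExtractor(
        'data/saved_models/GoogleNews-vectors-negative300.bin',  # word2vec binary (path from the listing)
        'data/saved_models/corpus_dict.pkl')                      # hypothetical pickled corpus_dict path
    sentence = 'The results were surprisingly good.'
    annotations = extractor.extract_annotations(sentence)   # per-token StanfordAnnotations
    parse_tree = extractor.extract_parse(sentence)           # constituency parse tree
    features = extractor.extract_features_pt16(sentence, annotations, parse_tree)
    print(features.shape)                                     # 1 x total feature dimensionality
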
Example #7
import pandas as pd
from nltk.parse import CoreNLPDependencyParser



parser = CoreNLPDependencyParser(url='http://localhost:9001')
fin = 'oie_corpus/science_eval.oie'
fout = 'oie_corpus/science_eval.oie.correct.head'

with open(fin) as fi, open(fout, 'a') as fo:
	for line in fi:
		data = line.strip().split('\t')
		sent = data[0]
		try:
			parse, = parser.raw_parse(sent)
		except Exception:
			continue
		# to_conll(3) yields "word<TAB>pos<TAB>head" lines; the third column is the head
		# index (named 'depth' in this script); the trailing empty line is skipped
		df = pd.DataFrame([x.split('\t') for x in parse.to_conll(3).split('\n') if x.strip()], columns=['word', 'pos', 'depth'])

		line = line.rstrip()
		line += '\t' + '<SYN_HEAD>'
		args = data[1:]

		word_list = list(df['word'])
		depth_list = list(df['depth'])
		for arg in args:
			arg = arg.split(' ')
			for i, w in enumerate(word_list):
				if word_list[i:i+len(arg)] == arg:
					print(arg)
					candidate_words = word_list[i:i+len(arg)]