from DataReader import DataReader
from TweetNormalizer import normalizeTweet
import torch
import tqdm
import _pickle as cPickle
from pprint import pprint
import spacy
import wikipedia
from wikipedia2vec import Wikipedia2Vec
import nltk
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
nlp = spacy.load("en_core_web_sm")

# Loading data
dr_tr = DataReader('./Data/olid-training-v1.0.tsv', 'A')
tr_data, tr_labels = dr_tr.get_labelled_data()
dr_tst = DataReader('./Data/testset-levela.tsv', 'A')
tst_data, tst_labels = dr_tst.get_test_data()

tst_data = tst_data[:]
tst_labels = tst_labels[:]
tr_data = tr_data[:]
tr_labels = tr_labels[:]

tr_entities = []
tst_entities = []

# Generating Noun-Phrase chunks using SpaCy
for line in tqdm.tqdm(tr_data):
    temp = []
    doc = nlp(normalizeTweet(line))
    # Collect the text of each noun-phrase chunk in the normalized tweet
    for chunk in doc.noun_chunks:
        temp.append(chunk.text)
    tr_entities.append(temp)

for line in tqdm.tqdm(tst_data):
    temp = []
    doc = nlp(normalizeTweet(line))
    for chunk in doc.noun_chunks:
        temp.append(chunk.text)
    tst_entities.append(temp)
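
# A minimal sketch of the presumed next step (the Wikipedia2Vec and cPickle
# imports above are otherwise unused here): map each extracted chunk to a
# pretrained Wikipedia entity embedding and cache the chunks. The model and
# output filenames below are assumptions, not files shipped with this repo.
wiki2vec = Wikipedia2Vec.load('./Data/enwiki_20180420_100d.pkl')  # hypothetical model file
tr_vectors = []
for chunks in tr_entities:
    # Keep only chunks that resolve to an entity known to the model
    tr_vectors.append([wiki2vec.get_entity_vector(c) for c in chunks
                       if wiki2vec.get_entity(c) is not None])
cPickle.dump(tr_entities, open('./Data/tr_entities.pkl', 'wb'))  # hypothetical path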
import tqdm
import _pickle as cPickle
from DataReader import DataReader
from dandelion import DataTXT
from dandelion import default_config
from TweetNormalizer import normalizeTweet

# Initializing the Dandelion API (a token can be obtained from https://dandelion.eu/)
default_config['token'] = 'INSERT TOKEN'
datatxt = DataTXT()

# Loading data
dr_tr = DataReader('./Data/olid-training-v1.0.tsv', 'A')
data_tr, labels_tr = dr_tr.get_labelled_data()
dr_tst = DataReader('./Data/testset-levela.tsv', 'A')
data_tst, label_tst = dr_tst.get_test_data()

data_tr = data_tr[:]
data_tst = data_tst[:]

entities_tr = []
entities_tst = []

# Entity extraction using Dandelion: keep the Wikipedia title of every
# entity annotation found in each normalized tweet
for line in tqdm.tqdm(data_tr):
    temp = []
    for annotation in datatxt.nex(normalizeTweet(line), lang='en').annotations:
        temp.append(annotation.title)
    entities_tr.append(temp)

for line in tqdm.tqdm(data_tst):
    temp = []
    for annotation in datatxt.nex(normalizeTweet(line), lang='en').annotations:
        temp.append(annotation.title)
    entities_tst.append(temp)
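
# cPickle is imported above but never used; a plausible final step is caching
# the extracted entities to disk. The output paths here are assumptions.
cPickle.dump(entities_tr, open('./Data/entities_tr.pkl', 'wb'))
cPickle.dump(entities_tst, open('./Data/entities_tst.pkl', 'wb'))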
from DataReader import DataReader
from TweetNormalizer import normalizeTweet
import tqdm
import _pickle as cPickle
import stanza
import nltk
from nltk.corpus import stopwords
import os
from stanza.server import CoreNLPClient

# Point stanza at a local Stanford CoreNLP installation
os.environ['CORENLP_HOME'] = '/data/users/abhavan/stanford-corenlp-4.0.0'

def noun_phrases(_client, _text, _annotators=None):
    # Match every noun-phrase (NP) subtree of the constituency parse via Tregex
    pattern = 'NP'
    matches = _client.tregex(_text, pattern, annotators=_annotators)
    print("\n".join(["\t" + sentence[match_id]['spanString']
                     for sentence in matches['sentences']
                     for match_id in sentence]))

dr = DataReader('./Data/olid-training-v1.0.tsv', 'A')
data, labels = dr.get_labelled_data()
data = data[:]

with CoreNLPClient(timeout=30000, memory='16G') as client:
    englishText = "She should ask a few native Americans what their take on this is."
    print('---')
    print(englishText)
    noun_phrases(client, englishText, _annotators="tokenize,ssplit,pos,lemma,parse")
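
# The OLID data is loaded above but only the demo sentence is parsed. A sketch
# of running the same extraction over the dataset (this is an assumption about
# the intended use; tqdm and normalizeTweet are imported but otherwise unused):
with CoreNLPClient(timeout=30000, memory='16G') as client:
    for line in tqdm.tqdm(data):
        noun_phrases(client, normalizeTweet(line),
                     _annotators="tokenize,ssplit,pos,lemma,parse")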