def count_freqs(input_loc, output_loc):
    print(output_loc)
    vocab = English.default_vocab(get_lex_attr=None)
    tokenizer = Tokenizer.from_dir(vocab,
                                   path.join(English.default_data_dir(), 'tokenizer'))
    counts = PreshCounter()
    for json_comment in iter_comments(input_loc):
        doc = tokenizer(json_comment['body'])
        doc.count_by(ORTH, counts=counts)
    with io.open(output_loc, 'w', 'utf8') as file_:
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]
            if not string.isspace():
                file_.write('%d\t%s\n' % (freq, string))
class SpacyParser(object):
    '''https://spacy.io/#example-use'''

    def __init__(self, num_threads=4):
        self.nlp = English(tokenizer=True, parser=True, tagger=True,
                           entity=None, matcher=None)

    def parse(self, doc, doc_id=None):
        """Parse a raw document as a string into a list of sentences"""
        if len(doc.strip()) == 0:
            return
        doc = doc.decode("utf-8")
        for doc in self.nlp.pipe([doc], batch_size=50, n_threads=4):
            assert doc.is_parsed
            for sent_id, sent in enumerate(doc.sents):
                tokens = [t for t in sent]
                token_idxs = [t.idx for t in sent]
                words = [t.text for t in sent]
                lemmas = [self.nlp.vocab.strings[t.lemma] for t in tokens]
                poses = [self.nlp.vocab.strings[t.tag] for t in tokens]
                dep_labels = [self.nlp.vocab.strings[t.dep] for t in tokens]
                # index tokens to determine sentence offset for dependency tree
                token_idx = {t: i for i, t in enumerate(tokens)}
                dep_parents = [token_idx[t.head] for t in tokens]
                s = Sentence(words=words, lemmas=lemmas, poses=poses,
                             dep_parents=dep_parents, dep_labels=dep_labels,
                             sent_id=sent_id, doc_id=doc_id, text=sent.text,
                             token_idxs=token_idxs, doc_name=doc_id)
                yield s
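# Hedged usage sketch (not part of the original source): the document string and
# doc_id below are hypothetical, and Sentence is the external container class
# that parse() above is assumed to yield.
parser = SpacyParser()
for s in parser.parse('The quick brown fox jumps over the lazy dog.', doc_id='doc-0'):
    print(s.words)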
def test_thinc_load(self):
    data_dir = English.default_data_dir()
    model_loc = path.join(data_dir, 'deps', 'model')
    # n classes: moves.n_moves above
    # n features: len(templates) + 1 above
    model = LinearModel(92, 116)
    model.load(model_loc)
def vocab():
    vocab = English.default_vocab()
    lex = vocab['dog']
    assert vocab[vocab.strings['dog']].orth_ == 'dog'
    lex = vocab['the']
    lex = vocab['quick']
    lex = vocab['jumped']
    return vocab
def test_load_careful(self): config_data = {"labels": {"0": {"": True}, "1": {"": True}, "2": {"cc": True, "agent": True, "ccomp": True, "prt": True, "meta": True, "nsubjpass": True, "csubj": True, "conj": True, "dobj": True, "neg": True, "csubjpass": True, "mark": True, "auxpass": True, "advcl": True, "aux": True, "ROOT": True, "prep": True, "parataxis": True, "xcomp": True, "nsubj": True, "nummod": True, "advmod": True, "punct": True, "relcl": True, "quantmod": True, "acomp": True, "compound": True, "pcomp": True, "intj": True, "poss": True, "npadvmod": True, "case": True, "attr": True, "dep": True, "appos": True, "det": True, "nmod": True, "amod": True, "dative": True, "pobj": True, "expl": True, "predet": True, "preconj": True, "oprd": True, "acl": True}, "3": {"cc": True, "agent": True, "ccomp": True, "prt": True, "meta": True, "nsubjpass": True, "csubj": True, "conj": True, "acl": True, "poss": True, "neg": True, "mark": True, "auxpass": True, "advcl": True, "aux": True, "amod": True, "ROOT": True, "prep": True, "parataxis": True, "xcomp": True, "nsubj": True, "nummod": True, "advmod": True, "punct": True, "quantmod": True, "acomp": True, "pcomp": True, "intj": True, "relcl": True, "npadvmod": True, "case": True, "attr": True, "dep": True, "appos": True, "det": True, "nmod": True, "dobj": True, "dative": True, "pobj": True, "iobj": True, "expl": True, "predet": True, "preconj": True, "oprd": True}, "4": {"ROOT": True}}, "seed": 0, "features": "basic", "beam_width": 1} data_dir = English.default_data_dir() vocab = Vocab.from_dir(path.join(data_dir, 'vocab')) moves = ArcEager(vocab.strings, config_data['labels']) templates = get_templates(config_data['features']) model = Model(moves.n_moves, templates, path.join(data_dir, 'deps')) parser = Parser(vocab.strings, moves, model)
def vocab():
    data_dir = os.environ.get('SPACY_DATA')
    if data_dir is None:
        package = util.get_package_by_name('en')
    else:
        package = util.get_package(data_dir)
    vocab = English.default_vocab(package=package)
    lex = vocab['dog']
    assert vocab[vocab.strings['dog']].orth_ == 'dog'
    lex = vocab['the']
    lex = vocab['quick']
    lex = vocab['jumped']
    return vocab
def main():
    nlp = English(parser=False, tagger=False, entity=False)
    gazetteer = [u'M.I.A.', 'Shiny Happy People', 'James E. Jones']
    example_text = u'The artist M.I.A. did a cover of Shiny Happy People. People is not an entity.'
    pattern_ids = PreshMap()
    max_length = 0
    for pattern_str in gazetteer:
        pattern = nlp.tokenizer(pattern_str)
        bilou_tags = get_bilou(len(pattern))
        for word, tag in zip(pattern, bilou_tags):
            lexeme = nlp.vocab[word.orth]
            lexeme.set_flag(tag, True)
        pattern_ids[hash_string(pattern.text)] = True
        max_length = max(max_length, len(pattern))
    matcher = make_matcher(nlp.vocab, max_length)
    doc = nlp(example_text)
    matches = get_matches(matcher, pattern_ids, doc)
    merge_matches(doc, matches)
    for token in doc:
        print(token.text, token.ent_type_)
def preprocess(texts):
    nlp = English()
    docs = nlp.pipe(texts)
    for doc in docs:
        for np in doc.noun_chunks:
            # Only keep adjectives and nouns, e.g. "good ideas"
            while len(np) > 1 and np[0].dep_ not in ('amod', 'compound'):
                np = np[1:]
            if len(np) > 1:
                # Merge the tokens, e.g. good_ideas
                np.merge(np.root.tag_, np.text, np.root.ent_type_)
        # Iterate over named entities
        for ent in doc.ents:
            if len(ent) > 1:
                # Merge them into single tokens
                ent.merge(ent.root.tag_, ent.text, ent.label_)
        sentences = []
        for sent in doc.sents:
            sentences.append([token.text for token in sent])
        yield sentences
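# Hedged usage sketch (not in the original): the sample text is hypothetical.
# preprocess() is a generator, so iterate over it to obtain one list of
# sentences per input document, with noun chunks and entities merged into
# single tokens.
sample_texts = [u'Good ideas come from New York startups.']
for sentence_lists in preprocess(sample_texts):
    print(sentence_lists)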
def test_load(self):
    data_dir = English.default_data_dir()
    vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
    tagger = Tagger.from_dir(path.join(data_dir, 'tagger'), vocab)
corpus = Corpus(filename=download("parliament-corpus"))
questionTypology = QuestionTypology(corpus, data_dir, dataset_name='parliament',
                                    motifs_dir=motifs_dir, num_dims=25,
                                    num_clusters=num_clusters, verbose=False,
                                    random_seed=164)

# Preprocessing
# create spacy object
spacy_NLP = spacy.load('en')
vocab = English().vocab

question_fit_file = os.path.join(questionTypology.motifs_dir, 'question_fits.json')
superset_file = os.path.join(questionTypology.motifs_dir,
                             'question_supersets_arcset_to_super.json')

question_to_leaf_fits = []
question_threshold = questionTypology.question_threshold

super_mappings = {}
with open(superset_file) as f:
    for line in f.readlines():
        entry = json.loads(line)
        temp.append(nlp.vocab.strings[token.dep])
        temp.append([token.head.orth_, t[token.head.idx]])
        dep_triple.append(temp)
    return dep_triple


if __name__ == '__main__':
    # print 'data'
    if len(sys.argv) != 3:
        print 'usage: python pyfile dir_path input_name outputname'
        exit(1)
    dir_path = sys.argv[1]
    f_input = dir_path + sys.argv[2]
    nlp = English()
    texts = []
    stime = time.time()
    with codecs.open(f_input, 'r', 'utf-8') as file:
        for line in file:
            line = line.strip()
            lineNo, sentence, tags, tags_er = line.split('\t')
            texts.append(lineNo + sentence)
    etime = time.time()
    print 'load tests time:', etime - stime
    pool = Pool(30)
    try:
        DT_result = [generateDT(doc)
                     for doc in nlp.pipe(texts, n_threads=30, batch_size=100)]
    except:
        print 'read file exception'
def test_load(self):
    vocab = Vocab.from_dir(path.join(English.default_data_dir(), 'vocab'))
    project_path, "data/reviews/review_16_{0}_with_term.train".format(genre))
test_data_file = os.path.join(
    project_path, "data/reviews/review_16_{0}_with_term.test".format(genre))

if genre == 'laptop':
    word2vec_model = '/Users/yinfei.yang/workspace/nlp/word2vec/models/vectors-reviews-electronics.bin'
elif genre == 'restaurants':
    word2vec_model = '/Users/yinfei.yang/workspace/nlp/word2vec/models/vectors-reviews-restaurants.bin'

w2v_model = Word2Vec.load_word2vec_format(word2vec_model, binary=True)

data_helpers.load_data_and_term_labels(train_data_file, test_data_file)
x_text_train, y_train_labels, x_text_test, y_test_labels, labels = \
    data_helpers.load_data_and_term_labels(train_data_file, test_data_file)

en = English()

cd = 0
total = 0
for text, labels in zip(x_text_test, y_test_labels):
    doc = en(u'{0}'.format(text))
    noun_chunks = [str(nc) for nc in doc.noun_chunks]
    # words = text.split()
    for label in labels:
        total += 1
        label_term = label[1]
        flag = False
def __init__(self, num_threads=4):
    self.nlp = English(tokenizer=True, parser=True, tagger=True,
                       entity=None, matcher=None)
def setTrainingVars(self, P, corp, num_topics, NTest, NTrain, lapp="",
                    includeLabels=False):
    self.includeLabels = includeLabels
    self.T = NTest
    self.TRAIN = NTrain
    self.corpus = corp
    self.dfs = self.corpus.dfs()
    self.K = num_topics
    loc = lapp + "exports/" + P + "/lda_states/ldapy" + str(self.K)
    self.lda = models.ldamodel.LdaModel.load(loc)
    for z in range(0, self.K):
        topic = self.lda.state.get_lambda()[z]
        topic = topic / topic.sum()
        bestn = matutils.argsort(topic, 100, reverse=True)
        terms = [(id, topic[id]) for id in bestn]
        # terms = lda.get_topic_terms(z, 100)
        for term in terms:
            word = corp.dictionary[term[0]].lower()
            weight = term[1]
            occurences = self.dfs[term[0]]
            # idf = log(corpus.documentCount/(1+occurences))
            if word in self.wordweights:
                if weight > self.wordweights[word]:
                    self.wordweights[word] = weight  # * idf
            else:
                self.wordweights[word] = weight  # * idf
    # print('\n\n')
    with open(lapp + "exports/" + P + "/good_ADJ.txt", "r") as f:
        for line in f:
            self.good_adjs.append(line.strip())
    with open(lapp + "exports/" + P + "/bad_ADJ.txt", "r") as f:
        for line in f:
            self.bad_adjs.append(line.strip())
    with open(lapp + "exports/" + P + "/good_NOUN.txt", "r") as f:
        for line in f:
            self.good_verbs.append(line.strip())
    with open(lapp + "exports/" + P + "/bad_NOUN.txt", "r") as f:
        for line in f:
            self.bad_verbs.append(line.strip())
    with open(lapp + "exports/" + P + "/featuresAprioriLexicalPruned.txt", "r") as f:
        for line in f:
            self.product_features.append(line.strip())
    with open(lapp + "inputs/badwords.txt", "r") as f:
        for line in f:
            self.bad_words.append(line.decode('utf-8').strip())
    self.currentGenerator = NTrain * 2
    self.nnn = NTrain * 2
    self.nlp = English()
    self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    self.d = enchant.Dict("en_US")
import codecs
import cPickle
from spacy.en import English
import spacy

tokenizer = English(parser=False)
en_nlp = spacy.load('en')

name = "semeval_metonymic_test"  # Please specify the input file name.
label = 1  # 1 is for METONYMY/NON-LITERAL, 0 is for LITERAL
inp = codecs.open("data/" + name + ".txt", mode="r", encoding="utf-8")
# PLEASE FORMAT THE INPUT FILE AS ONE SENTENCE PER LINE. SEE BELOW:
# ENTITY<SEP>sentence<ENT>ENTITY<ENT>rest of sentence.
# Germany<SEP>Their privileges as permanent Security Council members, especially the right of veto,
# had been increasingly questioned by <ENT>Germany<ENT> and Japan which, as major economic powers.
out = []
seq_length = 5  # A window of 5 is the DEFAULT for the PUBLICATION methodology. Feel free to experiment.


def locate_entity(document, ent, left_w, right_w):
    left_w = '' if len(left_w) == 0 else left_w[-1].text
    right_w = '' if len(right_w) == 0 else right_w[0].text
    for doc in document:
        if doc.text == ent[0]:
            index = doc.i
            if left_w == '' or document[index - 1].text == left_w:
                if right_w == '' or document[index + len(ent)].text == right_w:
                    return index + len(ent) - 1
    raise Exception()  # If this is ever triggered, there are problems parsing the text. Check SpaCy output!
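# Hedged usage sketch (not from the original): locate_entity() above returns the
# index of the final token of an entity mention, disambiguated by its left and
# right context tokens. The sentence, entity list, and context spans below are
# hypothetical.
doc = en_nlp(u'Germany and Japan questioned the veto rights of Germany .')
ent = [u'Germany']
left_context = en_nlp(u'of')    # token(s) expected immediately left of the mention
right_context = en_nlp(u'.')    # token(s) expected immediately right of the mention
end_index = locate_entity(doc, ent, left_context, right_context)
print(end_index)  # index of the last token of the matched mention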
def test_single_token_string():
    nlp = English()
    tokens = nlp(u'foobar')
    assert tokens[0].string == 'foobar'
def __init__(self):
    self.nlp = English(parser=False, tagger=False, entity=False)
def __init__(self, additive=0, multiplicative=1):
    self.additive = additive
    self.multiplicative = multiplicative
    self.nlp = English(parser=False, tagger=False, entity=False)
file: word_container.py
description: a vector storage datastructure for word vectors
author: Luke de Oliveira ([email protected])
copyright: 2017 Vai Technologies, LLC. All Rights Reserved.
"""

import logging

import numpy as np
from spacy.en import English

from .token_container import TokenContainer

logger = logging.getLogger(__name__)

NLP = English()


def case(s, lower):
    if lower:
        return s.lower()
    return s


class WordVectorBoxException(Exception):
    """ Errors for VectorBox. """
    pass


class WordContainer(TokenContainer):
    """docstring for CharacterContainer"""
def main(): parser = argparse.ArgumentParser() parser.add_argument('-model', type=str, required=True) parser.add_argument('-weights', type=str, required=True) parser.add_argument('-results', type=str, required=True) args = parser.parse_args() model = model_from_json(open(args.model).read()) model.load_weights(args.weights) model.compile(loss='categorical_crossentropy', optimizer='rmsprop') questions_val = open('../data/preprocessed/questions_val2014.txt', 'r').read().decode('utf8').splitlines() questions_lengths_val = open( '../data/preprocessed/questions_lengths_val2014.txt', 'r').read().decode('utf8').splitlines() answers_val = open('../data/preprocessed/answers_val2014.txt', 'r').read().decode('utf8').splitlines() images_val = open('../data/preprocessed/images_val2014.txt', 'r').read().decode('utf8').splitlines() vgg_model_path = '../features/coco/vgg_feats.mat' questions_lengths_val, questions_val, answers_val, images_val = ( list(t) for t in zip(*sorted( zip(questions_lengths_val, questions_val, answers_val, images_val)))) print 'Model compiled, weights loaded' labelencoder = joblib.load('../models/labelencoder.pkl') features_struct = scipy.io.loadmat(vgg_model_path) VGGfeatures = features_struct['feats'] print 'Loaded vgg features' image_ids = open('../features/coco/coco_vgg_IDMap.txt').read().splitlines() img_map = {} for ids in image_ids: id_split = ids.split() img_map[id_split[0]] = int(id_split[1]) nlp = English() print 'Loaded word2vec features' nb_classes = 1000 y_predict_text = [] batchSize = 128 widgets = [ 'Evaluating ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] pbar = ProgressBar(widgets=widgets) for qu_batch, an_batch, im_batch in pbar( zip(grouper(questions_val, batchSize, fillvalue=questions_val[0]), grouper(answers_val, batchSize, fillvalue=answers_val[0]), grouper(images_val, batchSize, fillvalue=images_val[0]))): timesteps = len(nlp( qu_batch[-1])) #questions sorted in descending order of length X_q_batch = get_questions_tensor_timeseries(qu_batch, nlp, timesteps) if 'language_only' in args.model: X_batch = X_q_batch else: X_i_batch = get_images_matrix(im_batch, img_map, VGGfeatures) X_batch = [X_q_batch, X_i_batch] y_predict = model.predict_classes(X_batch, verbose=0) y_predict_text.extend(labelencoder.inverse_transform(y_predict)) incorrect_val = 0 correct_val = 0 f1 = open(args.results, 'w') for prediction, truth, question, image in zip(y_predict_text, answers_val, questions_val, images_val): temp_count = 0 for _truth in truth.split(';'): if prediction == _truth: temp_count += 1 if temp_count > 2: correct_val += 1 else: incorrect_val += 1 f1.write(question.encode('utf-8')) f1.write('\n') f1.write(image.encode('utf-8')) f1.write('\n') f1.write(prediction) f1.write('\n') f1.write(truth.encode('utf-8')) f1.write('\n') f1.write('\n') f1.write('Final Accuracy is ' + str(float(correct_val) / (incorrect_val + correct_val))) f1.close() f1 = open('../results/overall_results.txt', 'a') f1.write(args.weights + '\n') f1.write(str(float(correct_val) / (incorrect_val + correct_val)) + '\n\n') f1.close() print 'Final Accuracy on the validation set is', float(correct_val) / ( incorrect_val + correct_val)
def getpatents(targetyear): timestart = time.time() getpatents_directory = os.getcwd() # get current working directory errorCount = 0 nlp = English() PATCOUNT_ORIGIN = [ 205, 260, 280, 264, 298, 301, 293, 282, 332, 315, 346, 311, 265, 326, 375, 309, 348, 339, 446, 490, 488, 628, 723, 827, 884, 968, 1002, 1084, 1304, 1482, 1648, 1843, 2251, 2928, 3639, 3958, 3623, 2927, 2047, 904, 99 ] store_chunks_FullParent = [] getpatents_directory_output = getpatents_directory + "/Patents/" if not os.path.exists(getpatents_directory_output): os.mkdir(getpatents_directory_output, 0o755) csvfile_PatNUM = open( '(03) SolarPV_41585 Patent List ORIGINAL with dssc patents 9501 v0.2 only num.csv', 'r') csvfile_ouput_by_year = open( getpatents_directory_output + str(targetyear) + '.csv', 'w+') reader_PatNO = csv.reader(csvfile_PatNUM, delimiter=' ', quotechar='|') writer_yearoutput = csv.writer(csvfile_ouput_by_year, delimiter=',', quotechar=',', quoting=csv.QUOTE_MINIMAL) PATNUM = [] splited_words = 0 sumstart = 0 normalcount = 0 for PatCountofYear in PATCOUNT_ORIGIN: # this for loop make PAT_HEADER set into right position. if splited_words >= targetyear - 1976: # e.g , if targetyaer is 1977, PAT_HEADER need to start from 206. so sumstart=205 and PAT HEADER started from 206. break else: sumstart += PatCountofYear splited_words += 1 PAT_HEADER = sumstart # if PAT_HEADER=n,PAT_HEADER pointed nth row exactly in reader_PatNO. for PATNO in reader_PatNO: PATNUM.append(PATNO[0]) # print(PATNUM[206]) while normalcount < PATCOUNT_ORIGIN[targetyear % 1976]: PAT_HEADER += 1 # HEADER가 어디선가 +1이 안되고있움. # print(PAT_HEADER) if PAT_HEADER == (sumstart + PATCOUNT_ORIGIN[targetyear % 1976] + 1): # HEADER value exceed valid range of year's patent count. break url = ''.join([ 'https://patents.google.com/patent/', PATNUM[PAT_HEADER - 1], '/en' ]) # row 1 in reader_PatNO is stored PATNUM[0]. print("\nURL NUMBER : " + str(PAT_HEADER) + " = " + url + "\n") urlText, backCitation, pubDate = readWEBSITE.getText(url) if urlText is None: # error occur at parsing patent. errorCount += 1 continue normalcount += 1 doc = nlp(urlText.decode('utf-8')) chunks_store = [] store_chunks_singlePatent = [] store_chunks_singlePatent.append(PAT_HEADER) for word in doc.noun_chunks: chunks_store.append(word) for span in chunks_store: store_str = span.text # get text part of span in chunks_store. 
splited_words = store_str.split() splited_word = [] # store_chunks_singlePatent.append() for splited_single_word in splited_words: stop_TF = False ########### Below down is temporary 'stop word list' ########## if splited_single_word == 'a': stop_TF = True if splited_single_word == 'A': stop_TF = True if splited_single_word == 'an': stop_TF = True if splited_single_word == 'An': stop_TF = True if splited_single_word == 'the': stop_TF = True if splited_single_word == 'The': stop_TF = True if splited_single_word == 'THE': stop_TF = True if splited_single_word == 'this': stop_TF = True if splited_single_word == 'This': stop_TF = True if splited_single_word == 'their': stop_TF = True if splited_single_word == 'Their': stop_TF = True if splited_single_word == 'Such': stop_TF = True if splited_single_word == 'such': stop_TF = True if splited_single_word == 'it': stop_TF = True if splited_single_word == 'It': stop_TF = True if splited_single_word == 'they': stop_TF = True if splited_single_word == 'They': stop_TF = True if splited_single_word == 'these': stop_TF = True if splited_single_word == 'These': stop_TF = True if stop_TF is True: # if word is in stopword list,check next word. continue else: splited_word.append(splited_single_word) combinedWord = " ".join(splited_word) # join word into one string. if combinedWord is "": # if string is null, string of word don't append into list. continue store_chunks_singlePatent.append( combinedWord ) # store_chunks_str_singlePatents store all of words used in particular PATENT. store_chunks_FullParent.append( store_chunks_singlePatent ) # store_chunks_str_FulParent store all of word used in all patent of targetyear. for row_input in store_chunks_FullParent: row_input = str(row_input) writer_yearoutput.writerow(row_input.split(",")) print("Error occur {0} times. Success {1} times\n".format( errorCount, normalcount)) csvfile_PatNUM.close() csvfile_ouput_by_year.close() timeend = time.time() print("it takes {0} sec for the get Patent text of {1}".format( (timeend - timestart), targetyear)) return None
import re
import os
import numpy as np
import json
import pickle
import datetime
import spacy
from keras.utils import to_categorical
from nltk.tokenize import word_tokenize, sent_tokenize
from load_squad_wiki_data import get_squad_data, get_squad_wiki_data
from gensim.models import Word2Vec
from spacy.en import English

nlp = spacy.load('en', parser=False, matcher=False, add_vectors=False)
nlp_en = English()


class MakeIter(object):
    def __init__(self, generator_func, **kwargs):
        self.generator_func = generator_func
        self.kwargs = kwargs

    def __iter__(self):
        return self.generator_func(**self.kwargs)


class Embeddings:
    def __init__(self, size, window, min_count, workers):
        self.size = size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        base_file_name = '_'.join([str(number) for number in [size, window, min_count, workers]])
        self.path_word2vec_model = '../data/word2vec_model_{0}.pickle'.format(base_file_name)
        self.path_word_tokenized_sentence = '../data/word_tokenized_sentence_{0}.json'.format(base_file_name)
def main(filename, systemname, print_us, print_ont, statistics, link, prolog, per_role, threshold, base, weights): """General class to run the entire program """ # Initialize spaCy just once (this takes most of the time...) print("Initializing Natural Language Processor . . .") start_nlp_time = timeit.default_timer() nlp = English() nlp_time = timeit.default_timer() - start_nlp_time start_parse_time = timeit.default_timer() miner = StoryMiner() # Read the input file set = Reader.parse(filename) us_id = 1 # Keep track of all errors success = 0 fail = 0 list_of_fails = [] errors = "" c = Counter() # Keeps track of all succesfully created User Stories objects us_instances = [] failed_stories = [] # Parse every user story (remove punctuation and mine) for s in set: try: user_story = parse(s, us_id, systemname, nlp, miner) user_story = c.count(user_story) success = success + 1 us_instances.append(user_story) except ValueError as err: failed_stories.append([us_id, s, err.args]) errors += "\n[User Story " + str(us_id) + " ERROR] " + str( err.args[0]) + "! (\"" + " ".join(str.split(s)) + "\")" fail = fail + 1 us_id = us_id + 1 # Print errors (if found) if errors: Printer.print_head("PARSING ERRORS") print(errors) parse_time = timeit.default_timer() - start_parse_time # Generate the term-by-user story matrix (m), and additional data in two other matrices start_matr_time = timeit.default_timer() matrix = Matrix(base, weights) matrices = matrix.generate(us_instances, ''.join(set), nlp) m = matrices[0] count_matrix = matrices[1] stories_list = matrices[2] rme = matrices[3] matr_time = timeit.default_timer() - start_matr_time # Print details per user story, if argument '-u'/'--print_us' is chosen if print_us: print("Details:\n") for us in us_instances: Printer.print_us_data(us) # Generate the ontology start_gen_time = timeit.default_timer() patterns = Constructor(nlp, us_instances, m) out = patterns.make(systemname, threshold, link) output_ontology = out[0] output_prolog = out[1] output_ontobj = out[2] output_prologobj = out[3] onto_per_role = out[4] # Print out the ontology in the terminal, if argument '-o'/'--print_ont' is chosen if print_ont: Printer.print_head("MANCHESTER OWL") print(output_ontology) gen_time = timeit.default_timer() - start_gen_time # Gather statistics and print the results stats_time = 0 if statistics: start_stats_time = timeit.default_timer() statsarr = Statistics.to_stats_array(us_instances) Printer.print_head("USER STORY STATISTICS") Printer.print_stats(statsarr[0], True) Printer.print_stats(statsarr[1], True) Printer.print_subhead( "Term - by - User Story Matrix ( Terms w/ total weight 0 hidden )") hide_zero = m[(m['sum'] > 0)] print(hide_zero) stats_time = timeit.default_timer() - start_stats_time # Write output files w = Writer() folder = "output/" + str(systemname) reports_folder = folder + "/reports" stats_folder = reports_folder + "/stats" outputfile = w.make_file(folder + "/ontology", str(systemname), "omn", output_ontology) files = [["Manchester Ontology", outputfile]] outputcsv = "" sent_outputcsv = "" matrixcsv = "" if statistics: outputcsv = w.make_file(stats_folder, str(systemname), "csv", statsarr[0]) matrixcsv = w.make_file(stats_folder, str(systemname) + "-term_by_US_matrix", "csv", m) sent_outputcsv = w.make_file(stats_folder, str(systemname) + "-sentences", "csv", statsarr[1]) files.append(["General statistics", outputcsv]) files.append(["Term-by-User Story matrix", matrixcsv]) files.append(["Sentence statistics", sent_outputcsv]) if prolog: outputpl = 
w.make_file(folder + "/prolog", str(systemname), "pl", output_prolog) files.append(["Prolog", outputpl]) if per_role: for o in onto_per_role: name = str(systemname) + "-" + str(o[0]) pont = w.make_file(folder + "/ontology", name, "omn", o[1]) files.append(["Individual Ontology for '" + str(o[0]) + "'", pont]) # Print the used ontology generation settings Printer.print_gen_settings(matrix, base, threshold) # Print details of the generation Printer.print_details(fail, success, nlp_time, parse_time, matr_time, gen_time, stats_time) report_dict = { "stories": us_instances, "failed_stories": failed_stories, "systemname": systemname, "us_success": success, "us_fail": fail, "times": [[ "Initializing Natural Language Processor (<em>spaCy</em> v" + pkg_resources.get_distribution("spacy").version + ")", nlp_time ], ["Mining User Stories", parse_time], ["Creating Factor Matrix", matr_time], ["Generating Manchester Ontology", gen_time], ["Gathering statistics", stats_time]], "dir": os.path.dirname(os.path.realpath(__file__)), "inputfile": filename, "inputfile_lines": len(set), "outputfiles": files, "threshold": threshold, "base": base, "matrix": matrix, "weights": m['sum'].copy().reset_index().sort_values( ['sum'], ascending=False).values.tolist(), "counts": count_matrix.reset_index().values.tolist(), "classes": output_ontobj.classes, "relationships": output_prologobj.relationships, "types": list(count_matrix.columns.values), "ontology": Utility.multiline(output_ontology) } # Finally, generate a report report = w.make_file(reports_folder, str(systemname) + "_REPORT", "html", generate_report(report_dict)) files.append(["Report", report]) # Print the location and name of all output files for file in files: if str(file[1]) != "": print( str(file[0]) + " file succesfully created at: \"" + str(file[1]) + "\"")
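# Hedged invocation sketch (not from the original source): only the signature of
# main() comes from the definition above; the file name and argument values below
# are hypothetical placeholders.
# main("input/stories.txt", "ExampleSystem", print_us=False, print_ont=True,
#      statistics=True, link=False, prolog=False, per_role=False,
#      threshold=1.0, base=0.5, weights=[1, 1, 1, 1])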
class Sentence: # SCORENLP = StanfordCoreNLP("/Users/rajpav/anaconda2/lib/python2.7/site-packages/stanford-corenlp-full-2016-10-31") SCORENLP = StanfordCoreNLP( "/Users/acharya.n/anaconda2/lib/python2.7/stanford-corenlp-full-2016-10-31" ) TEXT_LEMMA_PATTERN = re.compile('(\[{1})([a-zA-Z0-9.= $_<>\"\/?]+)(\]{1})') PARTS_OF_SPEECH_PATTERN = re.compile( '(\({1})([a-zA-Z0-9.= $_<>\-\"\/?]+)(\){1})') NON_ALLOWED_NOUN_CHUNKS = ["how", "many", "much"] STRING_TO_DICT_PATTERN = re.compile(r'(\S+)=(".*?"|\S+)') SINGULAR_PRONOUN = ['he', 'she', 'it', 'him', 'her', 'his'] SINGULAR_SUBJECT_PRONOUN = ['he', 'she', 'him', 'her', 'his'] SINGULAR_OBJECT_PRONOUN = ['it'] PLURAL_PRONOUN = ['they', 'them'] LEMMATIZER_MODULE = LemmatizerModule() SPACY_PARSER = English() def __init__(self, sentence_json, question, sentence_index): self.m_predicted_label = sentence_json["PredictedLabel"] self.m_sentence_text = sentence_json["Sentence"] self.m_syntactic_pattern = sentence_json["SyntacticPattern"] ###print self.m_sentence_text self.m_has_a_cardinal = False self.m_cardinal = None self.m_has_a_dobj = False self.m_dobj = None self.m_has_a_nsubj = False self.m_nsubj = None self.m_pobj = None self.m_has_a_pobj = False self.m_quantified_entity = None self.m_owner_entity = None self.m_object_entity = None self.m_evaluating_subject = None self.m_evaluating_object = None self.m_has_a_pronoun = False self.m_processed_pronoun = None self.m_transfer_entity = None self.m_transfer_quantified_entity = None self.m_all_pronouns = [] self.m_all_nouns = [] self.m_all_noun_lemmas = [] self.m_question = question self.m_words_pos = OrderedDict() self.m_is_first_word_an_expletive = True if self.m_syntactic_pattern[ 0] == 'E' else False self.m_expletive_index = -1 if 'E' in self.m_syntactic_pattern: self.m_expletive_index = self.m_syntactic_pattern.index('E') self.m_sentence_index = sentence_index self.m_is_pronoun_noun_found = False self.m_current_pronoun = None self.temp_transfer_entity = None self.temp_dobj = None self.m_has_an_unknown_quantity = False self.m_possible_evaluating_subjects = [] self.m_possible_evaluating_object = None self.m_question_label = None self.m_complex_nouns = [] self.m_sentece_words = [] self.m_words_index = {} self.m_compound_modifiers = [] question_label_string = "QuestionLabel" if self.m_predicted_label == '?' 
and question_label_string in sentence_json: self.m_question.m_evaluating_sentence = self self.m_question_label = sentence_json["QuestionLabel"] def __str__(self): return self.m_sentence_text def parse_sentence(self): self.extract_dependencies() # self.process_pronouns() if self.m_predicted_label == '?': self.extract_evaluation_entities() else: self.extract_entities() def extract_dependencies(self): print 'in extract dep' print self.m_sentence_text corenlp_result = json.loads( Sentence.SCORENLP.parse(self.m_sentence_text)) current_sentence = corenlp_result["sentences"][0] parse_tree = current_sentence["parsetree"] # print 'parse_tree',parse_tree self.m_dependencies = current_sentence["dependencies"] # self.m_matched_tuples = Sentence.TEXT_LEMMA_PATTERN.findall(parse_tree) # print 'matched tuples',self.m_matched_tuples # print self.m_dependencies self.m_matched_pos = Sentence.PARTS_OF_SPEECH_PATTERN.findall( parse_tree) # print self.m_matched_pos index_counter = 0 for matched_pos in self.m_matched_pos: index_counter = index_counter + 1 word_pos = matched_pos[1].split(" ") parts_of_speech = word_pos[0] word = word_pos[1].lower() # #print word self.m_words_index[word] = index_counter self.m_sentece_words.append(word) self.m_words_pos[word] = parts_of_speech if parts_of_speech in PublicKeys.NOUN_POS: lemma = Sentence.LEMMATIZER_MODULE.lemmatize(word) self.m_all_noun_lemmas.append(lemma) self.m_all_nouns.append(word) if parts_of_speech == 'NNP': self.m_question.add_proper_noun(word) if parts_of_speech == 'CD': self.m_has_a_cardinal = True if self.m_expletive_index != -1: self.m_is_first_word_an_expletive = True try: # #print 1 # #print word # #print 2 # #print float(word) # #print 3 # #print str(float(word)) self.m_cardinal = Decimal(word) # #print self.m_cardinal except: self.m_cardinal = PublicKeys.text2int(word) # #print self.m_cardinal self.m_words_index[str(self.m_cardinal)] = index_counter # #print 'insert' # #print self.m_words_index[str(self.m_cardinal)] if self.m_predicted_label == '-': self.m_cardinal = -self.m_cardinal elif parts_of_speech == 'PRP' or parts_of_speech == 'PRP$': ###print 'found pronoun' self.m_has_a_pronoun = True self.m_all_pronouns.append(word) ###print self.m_is_pronoun_noun_found if self.m_is_pronoun_noun_found == False: ###print 'In sentence' ###print self.m_sentence_index ###print self.m_question.m_coref_dict ###print self.m_question.m_coref_dict[self.m_sentence_index] current_sentence_coref_dict = self.m_question.m_coref_dict[ self.m_sentence_index] ###print 'pronoun not found yet' + word ###print current_sentence_coref_dict if word in current_sentence_coref_dict: ###print 'word in dict true' current_pronoun_noun = current_sentence_coref_dict[ word] ###print 'current_pronoun_noun' + current_pronoun_noun ###print self.m_question.m_proper_nouns if current_pronoun_noun.lower( ) in self.m_question.m_proper_nouns: self.m_processed_pronoun = current_pronoun_noun self.m_is_pronoun_noun_found = True self.m_current_pronoun = word ###print "Pronoun Noun :" + self.m_processed_pronoun if (self.m_predicted_label == '-' or self.m_predicted_label == '+' or self.m_predicted_label == '=') and self.m_has_a_cardinal == False: self.m_has_a_cardinal = True self.m_cardinal = 'X' self.m_has_an_unknown_quantity = True def extract_entities(self): print 'in extract entities' sentence_parse = Sentence.SPACY_PARSER(self.m_sentence_text) spacy_subj = None temp_pobj = None for token in sentence_parse: token_dep = token.dep_ print(token.orth_, token.dep_, token.head.orth_, [t.orth_ for t in 
token.lefts], [t.orth_ for t in token.rights]) if token_dep == 'pobj': temp_pobj = token elif token_dep == 'nsubj' or token_dep == 'nsubjpass': spacy_subj = token.orth_.lower() elif token_dep == 'poss': self.assign_poss_entities(token) elif token_dep == 'compound' or token_dep == 'amod': print 'in compound and amod case' modifier = Sentence.LEMMATIZER_MODULE.lemmatize(token.orth_) compound_dobj = Sentence.LEMMATIZER_MODULE.lemmatize( token.head.orth_) compound_modifier = CompoundModifier(modifier, compound_dobj) print 'found compound modifier:', modifier, compound_dobj self.m_compound_modifiers.append(compound_modifier) self.m_complex_nouns.append(modifier + " " + compound_dobj) # self.temp_dobj = compound_dobj sentence_svos = findSVOs(sentence_parse) print "svos", sentence_svos, len(sentence_svos) if len(sentence_svos) > 0: transfer_entity_relation = None # #print 'starts with an expl:',self.m_is_first_word_an_expletive if self.m_is_first_word_an_expletive == False: print 'svo' print sentence_svos[0][0] print sentence_svos[0][2] # trying to assign subj and obj from svo self.assign_nsubj(sentence_svos[0][0]) self.assign_dobj(sentence_svos[0][2]) print 'after trying to assign subj', self.m_nsubj print 'after trying to assign dobj:' print 'dobj exists?:', self.m_has_a_dobj print 'dobj:', self.m_dobj print 'temp dobj:', self.temp_dobj #print temp_pobj if self.m_has_a_dobj == False: if self.temp_dobj != None: print 'before temp dobj' self.assign_dobj(self.temp_dobj) if self.temp_transfer_entity != None: self.assign_transfer_entity( self.temp_transfer_entity, 'dobj') elif temp_pobj != None: print 'before temp pobj' self.assign_dobj(temp_pobj.orth_.lower()) #self.assign_dobj(self.m_pobj, 'pobj') self.assign_transfer_entity(sentence_svos[0][2], 'dobj') elif temp_pobj != None: print 'in temp dobj != None' self.assign_transfer_entity(temp_pobj.orth_.lower(), 'pobj') elif self.temp_transfer_entity != None: print 'in temp transfer entity !- None' self.assign_transfer_entity(self.temp_transfer_entity, 'poss') else: # #print 'before 2nsd svo' self.assign_dobj(sentence_svos[0][2]) if temp_pobj != None: self.assign_nsubj(temp_pobj.orth_.lower()) ###print 'before calling extract quantified' self.extract_quantified_entities(True, transfer_entity_relation) elif spacy_subj != None and temp_pobj != None: self.temp_dobj = temp_pobj.orth_ print 'In spacy' #print self.temp_dobj self.assign_nsubj(spacy_subj) self.assign_dobj(self.temp_dobj) self.extract_quantified_entities(False, None) elif spacy_subj != None and self.m_question.m_question_label != 'c': # print 'spacy_subj is not none' self.assign_dobj(spacy_subj) self.extract_quantified_entities(False, None) elif self.m_question.m_question_label == 'c': if self.m_has_a_cardinal: print 'found nothing should do something.' 
quantified_non_entity = QuantifiedNonEntity(self.m_cardinal) if spacy_subj != None: self.assign_nsubj(spacy_subj) quantified_non_entity.set_owner_entity(self.m_owner_entity) self.m_question.add_quantified_non_entity( quantified_non_entity) def assign_nsubj(self, subj): self.m_has_a_nsubj = True self.m_nsubj = subj if self.m_nsubj in self.m_all_pronouns: self.m_nsubj = self.m_processed_pronoun self.m_owner_entity = Entity('nsubj', self.m_nsubj) def assign_dobj(self, dobj): # #print dobj #print 'in assigning dobj' is_dobj_integer = self.is_integer(dobj) if is_dobj_integer == False and dobj not in self.m_question.m_proper_nouns: self.m_has_a_dobj = True self.m_dobj = dobj elif dobj in self.m_question.m_proper_nouns: self.temp_transfer_entity = dobj if self.temp_transfer_entity in self.m_all_pronouns: self.temp_transfer_entity = self.m_processed_pronoun elif is_dobj_integer == True: for k, v in self.m_question.m_quantified_entities.items(): if self.m_has_a_dobj: break for e in v: print 'assigning cardianl d object' self.m_has_a_dobj = True self.m_dobj = unicode(e.get_name()) self.m_words_pos[e.get_name()] = 'NN' self.m_words_index[e.get_name()] = self.m_words_index[dobj] break def assign_pobj(self, token): token_orth = token.orth_.lower() ###print token_orth ###print self.m_question.get_quantified_entities() ###print self.m_question.get_quantified_entity_objects() if token_orth in self.m_question.get_quantified_entities(): ###print 'assigning pobj' self.m_pobj = token_orth if self.m_pobj in self.m_all_pronouns: self.m_pobj = self.m_processed_pronoun self.m_has_a_pobj = True #elif token_orth in self.m_question.get_quantified_entity_objects(): else: self.temp_dobj = token_orth def assign_poss_entities(self, token): self.temp_transfer_entity = token.orth_.lower() self.temp_dobj = token.head.orth_.lower() def assign_transfer_entity(self, val, pos): if val != None: val = val.lower() ##print 'in assign transfer entity:' + val if val in self.m_all_pronouns: val = self.m_processed_pronoun self.m_transfer_entity = Entity(pos, val) def extract_normal_entities(self): transfer_entity_relation = None for dependency in self.m_dependencies: relation = dependency[0] if relation == 'nsubj': self.m_has_a_nsubj = True self.m_nsubj = dependency[2] self.m_owner_entity = Entity('nsubj', self.m_nsubj) elif relation == 'dobj': self.m_has_a_dobj = True self.m_dobj = dependency[2] elif relation == 'nmod:to' or relation == 'nmod:from' or relation == 'nmod:poss' or relation == 'iobj': transfer_entity_relation = relation if self.m_has_a_pronoun: self.m_transfer_entity = Entity( relation, unicode(self.m_processed_pronoun, "utf-8")) else: self.m_transfer_entity = Entity(relation, dependency[2]) self.extract_quantified_entities(True, transfer_entity_relation) def extract_quantified_entities(self, to_create_transfer_entity, transfer_entity_relation): ##print self.m_transfer_entity # ##print self.m_owner_entity print 'in extract quantified entities' if self.m_cardinal != None and self.m_has_an_unknown_quantity == False: self.validate_dobj_index() print 'in cardinal case and no unknown quantity' # #print self.m_dobj lemmatized_dobj = Sentence.LEMMATIZER_MODULE.lemmatize(self.m_dobj) compound_modifier = self.get_compound_modifier_for_dobj( self.m_dobj) if self.m_owner_entity != None: print 'owner entity not none' ##print self.m_dobj ##print type(self.m_dobj) ##print self.m_dobj.lower() owner_modified_cardinal = self.m_cardinal if self.m_has_an_unknown_quantity: if self.m_predicted_label == '-': owner_modified_cardinal = "-" + 
self.m_cardinal transfer_transaction_cardinal = self.m_cardinal else: owner_modified_cardinal = self.m_cardinal transfer_transaction_cardinal = "-" + self.m_cardinal else: transfer_transaction_cardinal = -self.m_cardinal print 'after calc transfer cardinal:', transfer_transaction_cardinal temp_quantified_entity = QuantifiedEntity( owner_modified_cardinal, 'dobj', lemmatized_dobj, False) temp_quantified_entity.set_owner_entity(self.m_owner_entity) transfer_transaction = TransferTransaction( to_create_transfer_entity, self.m_transfer_entity, lemmatized_dobj, transfer_transaction_cardinal) temp_quantified_entity.add_transfer_transaction( transfer_transaction) print 'after merging', compound_modifier if compound_modifier != None: print 'modifier quantity,' compound_modifier.set_quantity(owner_modified_cardinal) temp_quantified_entity.add_compound_modifier( compound_modifier) for k, v in self.m_question.m_quantified_entities.items(): for e in v: print 'comparisons:', e.get_name( ), compound_modifier.m_dobj if e.get_name() == compound_modifier.m_dobj: print 'adding compoung modifier' e.add_compound_modifier(compound_modifier) merge_entities = self.get_or_merge_entity( temp_quantified_entity, transfer_transaction) # self.m_quantified_entity = temp_quantified_entity if merge_entities == True else None else: self.m_owner_entity = Entity("global", u"global") global_modified_cardinal = self.m_cardinal if self.m_has_an_unknown_quantity: if self.m_predicted_label == '-': global_modified_cardinal = "-" + self.m_cardinal else: global_modified_cardinal = self.m_cardinal elif global_modified_cardinal < 0: global_modified_cardinal = -global_modified_cardinal temp_quantified_entity = QuantifiedEntity( global_modified_cardinal, 'dobj', lemmatized_dobj, False) temp_quantified_entity.set_owner_entity(self.m_owner_entity) merge_entities = self.get_or_merge_entity( temp_quantified_entity, None) # self.m_quantified_entity = temp_quantified_entity if merge_entities == True else None if to_create_transfer_entity and self.m_transfer_entity != None: ##print 'creating transfer entity' transfer_modified_cardinal = self.m_cardinal if self.m_has_an_unknown_quantity: if self.m_predicted_label == '+': transfer_modified_cardinal = "-" + self.m_cardinal transfer_transaction_cardinal = self.m_cardinal else: transfer_modified_cardinal = self.m_cardinal transfer_transaction_cardinal = "-" + self.m_cardinal else: transfer_modified_cardinal = -self.m_cardinal transfer_transaction_cardinal = self.m_cardinal ##print transfer_modified_cardinal temp_transfer_quantified_entity = QuantifiedEntity( transfer_modified_cardinal, transfer_entity_relation, lemmatized_dobj, True) temp_transfer_quantified_entity.set_owner_entity( self.m_transfer_entity) transfer_transaction = TransferTransaction( to_create_transfer_entity, self.m_owner_entity, lemmatized_dobj, transfer_transaction_cardinal) temp_transfer_quantified_entity.add_transfer_transaction( transfer_transaction) to_merge_transfer_entity = self.get_or_merge_entity( temp_transfer_quantified_entity, transfer_transaction) self.m_transfer_quantified_entity = temp_transfer_quantified_entity if to_merge_transfer_entity == True else None else: self.m_object_entity = Entity('dobj', self.m_dobj) def get_compound_modifier_for_dobj(self, dobj): dobj = Sentence.LEMMATIZER_MODULE.lemmatize(dobj) print 'In compound modifier for dobj', dobj, len( self.m_compound_modifiers) compound_modifier = None for modifier in self.m_compound_modifiers: print 'modifier dobj', modifier.m_dobj if dobj == 
modifier.m_dobj: compound_modifier = modifier break return compound_modifier def validate_dobj_index(self): num = self.m_cardinal # #print self.m_words_index # #print num # #print self.m_dobj if num < 0: num = -num if self.m_dobj == None: dobj_index = 0 else: if self.m_dobj.lower() in self.m_words_index: dobj_index = self.m_words_index[self.m_dobj.lower()] else: dobj_index = 0 dobj_lower = self.m_dobj.lower() # print 'pos before prp',self.m_words_pos # print 'dobj before prp',self.m_dobj if self.m_words_pos[self.m_dobj.lower( )] == 'PRP' or self.m_words_pos[dobj_lower] == 'PRP$': for k, v in self.m_question.m_quantified_entities.items(): if self.m_has_a_dobj: break for e in v: #print 'assigning pronoun object' self.assign_dobj(unicode(e.get_name())) break cardinal_index = self.m_words_index[str(num)] if str( num) in self.m_words_index else self.m_words_index[str(int(num))] if dobj_index < cardinal_index: current_possible_obj = None to_consider_for_objects = [] for current_word in self.m_words_index: current_word_index = self.m_words_index[current_word] if current_word_index > cardinal_index and ( current_word in self.m_all_nouns or current_word in self.m_all_pronouns): current_possible_obj = Sentence.LEMMATIZER_MODULE.lemmatize( current_word) break if current_possible_obj != None: self.assign_dobj(unicode(current_possible_obj)) def get_or_merge_entity(self, temp_entity, transfer_transaction): to_merge_entities = self.m_question.add_quantified_entity(temp_entity) print 'to merge?' ##print to_merge_entities if to_merge_entities: self.merge_entities(temp_entity, transfer_transaction) elif self.m_predicted_label == '=': temp_entity.flip_equal_to_state() return to_merge_entities def merge_entities(self, temp_quantified_entity, transfer_transaction): ##print "in merge" quantified_entities = self.m_question.get_quantified_entities() subject = temp_quantified_entity.get_owner_entity().get_name() #sentence_output = self.output(True) sentence_output = temp_quantified_entity.get_cardinal() subject_quantified_entities = quantified_entities[subject] for subject_quantified_entity in subject_quantified_entities: if subject_quantified_entity.get_name( ) == temp_quantified_entity.get_name(): if self.m_predicted_label == '=': subject_quantified_entity.set_equal_to_state( sentence_output) else: subject_quantified_entity.perform_operation( sentence_output, self.m_has_an_unknown_quantity, transfer_transaction) ##print subject_quantified_entity def extract_evaluation_entities(self): sentence_parse = Sentence.SPACY_PARSER(self.m_sentence_text) # print "in extract evaluating entities" # print sentence_parse for token in sentence_parse: if token.dep_ == 'compound' or token.dep_ == 'amod': print 'in compound and amod case' modifier = Sentence.LEMMATIZER_MODULE.lemmatize(token.orth_) compound_dobj = Sentence.LEMMATIZER_MODULE.lemmatize( token.head.orth_) compound_modifier = CompoundModifier(modifier, compound_dobj) print 'found compound modifier:', modifier, compound_dobj self.m_compound_modifiers.append(compound_modifier) self.m_complex_nouns.append( Sentence.LEMMATIZER_MODULE.lemmatize(token.orth_) + " " + Sentence.LEMMATIZER_MODULE.lemmatize(token.head.orth_)) ##print 'In extract evaluating entities' # noun_chunks = self.get_noun_chunks(self.m_sentence_text) # if self.m_is_pronoun_noun_found == True: ##print self.m_processed_pronoun # for index, val in enumerate(noun_chunks): # val_lower_unicode = val.lower() # for word in Sentence.NON_ALLOWED_NOUN_CHUNKS: # if word in val_lower_unicode: # val_lower_unicode = 
val_lower_unicode.replace(word,'') # # noun_chunks[index] = val_lower_str.replace(word,'') # ##print noun_chunks # ##print 'Before assigning chunk' # ##print val_lower_unicode # noun_chunks[index] = val_lower_unicode.strip() ##print "after removing non allowed chunks: ", noun_chunks # chunk_split = val.split() # if len(chunk_split) > 1: # sentence_split = self.m_sentence_text.split() # lemma_sentence = '' # for sentence_split_word in sentence_split: # lemma_sentence = lemma_sentence + ' ' + Sentence.LEMMATIZER_MODULE.lemmatize(sentence_split_word) # ##print lemma_sentence # noun_chunks = self.get_noun_chunks(lemma_sentence) # for index, val in enumerate(noun_chunks): # if val == self.m_current_pronoun: # noun_chunks[index] = self.m_processed_pronoun # noun_chunks[index] = Sentence.LEMMATIZER_MODULE.lemmatize(unicode(noun_chunks[index])).lower() ##print 'After lemmatizing and pronoun replacement' ##print noun_chunks # for noun in noun_chunks: # if noun in self.m_question.get_quantified_entities(): # self.m_possible_evaluating_subjects.append(noun) # elif self.m_possible_evaluating_object == None: # self.m_possible_evaluating_object = noun ##print 'possible subjects' ##print self.m_possible_evaluating_subjects ##print 'possible object' ##print self.m_possible_evaluating_object # for dependency in self.m_dependencies: # if dependency[0] == 'nsubj': # self.m_has_a_nsubj = True # self.m_nsubj = dependency[2] # self.m_evaluating_subject = Entity('nsubj', self.m_nsubj) # elif dependency[0] == 'dobj': # # extract parts of speech of the relation dep and gov # # if none of them is noun. apply some logic to find the evaluating object # ##print self.m_words_pos # temp_dobj = dependency[2] # temp_dobj_pos = self.m_words_pos[temp_dobj] # if temp_dobj_pos != None and temp_dobj_pos in PublicKeys.NOUN_POS: # self.m_has_a_dobj = True # self.m_dobj = dependency[2] # self.m_evaluating_object = Entity('dobj', self.m_dobj) # else: # ##print 'Couldn\'t find a dobj noun' # max = 0 # matching_noun = None # for noun in self.m_all_noun_lemmas: # for qes in self.m_question.get_quantified_entities().values(): # for qe in qes: # wup_similarity = self.word_similarity(noun, qe.get_name()) # if max < wup_similarity: # max = wup_similarity # matching_noun = qe # # self.m_evaluating_object = Entity('dobj', matching_noun.get_name()) def get_noun_chunks(self, text): response = unirest.post( "https://textanalysis.p.mashape.com/spacy-noun-chunks-extraction", headers={ "X-Mashape-Key": "KRSu5yA8domshWMHNzhofCid2f3fp1aOWWsjsnuS3zN7CYN9Kq", "Content-Type": "application/x-www-form-urlencoded", "Accept": "application/json" }, params={"text": text}) ##print response.body ##print response.raw_body response_json = json.loads(response.raw_body) # print 'response:',response_json ##print response_json["result"] return response_json["result"] def word_similarity(self, word1, word2): xx = wn.synsets(word1, pos=wn.NOUN) yy = wn.synsets(word2, pos=wn.NOUN) max = 0 for x in xx: for y in yy: wup_similarity = x.wup_similarity(y) max = wup_similarity if max < wup_similarity else max return max def is_integer(self, val): is_integer = True try: dummy = int(val) except: is_integer = False return is_integer def extract_result(self): ##print 'In extract result' quantified_entities = self.m_question.get_quantified_entities() result = None if self.m_question_label == 'all': return QuestionSentenceSolver.solve_for_all_label(self) elif self.m_question_label == '+': return QuestionSentenceSolver.solve_for_plus_label(self) elif self.m_question_label == 'c': 
return ComparisonSentenceSolver.solve_for_c_label(self) elif self.m_question_label == 'b': return ButConjunctionSentenceSolver.solve_for_but_label(self) elif self.m_question_label == 'u': return UnknownSentenceSolver.solve_for_unknown_label(self) else: return None # if len(self.m_possible_evaluating_subjects) == 1: # subject = self.m_possible_evaluating_subjects[0] # if subject in quantified_entities: # subjects_object_entities = quantified_entities[subject] # # for subjects_object_entity in subjects_object_entities: # ##print 'during comparison' # ##print subjects_object_entity # ##print self.m_possible_evaluating_object # if subjects_object_entity.get_name() == self.m_possible_evaluating_object: # result = subjects_object_entity # break # return result # subjects_object_entities = quantified_entities[self.m_evaluating_subject.get_name()] # result = None # ##print subjects_object_entities # for subjects_object_entity in subjects_object_entities: # ##print subjects_object_entity # ##print self.m_evaluating_object # if subjects_object_entity.get_name() == self.m_evaluating_object.get_name(): # result = subjects_object_entity # break # return result def process_pronouns(self): ##print 'process pronouns' if self.m_has_a_pronoun == True: singular_pronouns = [] plural_pronouns = [] nouns = self.m_question.get_quantified_entities().keys() for pronoun_tuple in self.m_all_pronouns: pronoun = pronoun_tuple["Text"].lower() if pronoun in Sentence.SINGULAR_PRONOUN: singular_pronouns.append(pronoun_tuple) for noun in reversed(nouns): ##print 'found' + noun self.m_processed_pronoun = noun break elif pronoun in Sentence.PLURAL_PRONOUN: self.sum_all_entities() # def sum_all_entities(self): ##print "do something" def output(self, ret_math_value): if ret_math_value == True: output = self.m_cardinal else: if self.m_predicted_label == '+' or self.m_predicted_label == '-': if self.m_cardinal != None: output = self.m_quantified_entity else: output = self.m_predicted_label + ' ' + 'X' return output
def test_period():
    EN = English()
    tokens = EN.tokenizer('best.Known')
    assert len(tokens) == 3
    tokens = EN('zombo.com')
    assert len(tokens) == 1
def EN():
    return English()
def test_load(self):
    data_dir = English.default_data_dir()
    vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
    tokenizer = Tokenizer.from_dir(vocab, path.join(data_dir, 'tokenizer'))
import time
start = time.time()
pt.train(sentences_w_tags, nr_iter=5)
end = time.time()
print "time taken = " + str(end - start)


# In[2]:

import os
from spacy.en import English, LOCAL_DATA_DIR, DOC
data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
nlp = English(parser=False, entity=False, data_dir=data_dir)


# In[17]:

corpus = "\n".join(" ".join(y) for y in [x[0] for x in sentences_w_tags])
doc = nlp(unicode(corpus))


# In[36]:

n = 0
print doc[n].lemma_
print doc[n].pos_
def test_load(self):
    data_dir = English.default_data_dir()
    vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
    parser = Parser.from_dir(path.join(data_dir, 'deps'), vocab.strings, ArcEager)
from collections import OrderedDict
from spacy.en import English  # NLP with spaCy https://spacy.io

nlp = English()  # will take some time to load

# Useful properties, summary of the docs from https://spacy.io
#
# class Doc
# properties: text, vector, vector_norm, ents, noun_chunks, sents
# method: similarity
# NER specs https://spacy.io/docs#annotation-ner
# doc tokenization will preserve meaningful units together
#
# class Token
# token.doc -> parent sequence
# string features: text, lemma, lower, shape
# boolean flags: https://spacy.io/docs#token-booleanflags
# POS: pos_, tag_
# tree: https://spacy.io/docs#token-navigating
# ner: ent_type, ent_iob
#
# class Span
# span.doc -> parent sequence
# vector, vector_norm
# string features: text, lemma
# methods: similarity
# syntactic parse: use root, lefts, rights, subtree
# https://spacy.io/docs#span-navigativing-parse

# !more to implement:
# also filter to prepare for tree
# syntactic parse tree https://spacy.io/docs#span-navigativing-parse
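# Hedged demonstration of the properties summarized above (the example sentence
# is hypothetical; attribute names follow the legacy spacy.en API used in this file).
doc = nlp(u'London is the capital of the United Kingdom.')
print(doc.ents)                 # Doc: named entities
print(list(doc.noun_chunks))    # Doc: noun chunks
token = doc[0]
print(token.text, token.lemma_, token.pos_, token.tag_, token.ent_type_)
span = next(doc.sents)          # Span: first sentence
print(span.text, span.root.text)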
class SpacyEventExtractor: _nlp = English() _keywords = list(map(lambda s: s.strip().lower(), open('keywords.txt', 'r').readlines())) _known_phrases = [('is out', 'released'), ('is here', 'released'), ('is there', 'released'), ('is out', 'released'), ('is open', 'started'), ('is available', 'released'), ('please welcome', 'we released') ] _important_actions = ['release', 'start', 'publish', 'announce', 'update'] def __init__(self): pass @staticmethod def _have_pronouns(text: str) -> bool: pronouns = ['i', 'you', 'he', 'she', 'they', 'be', 'him', 'her', 'it'] # 'we' is a good pronoun as it refers to a company return list(filter(lambda s: s.lower() in pronouns, text.split())) != [] @staticmethod def _is_present_simple(verb: spacy.tokens.Token) -> bool: for child in verb.children: if child.orth_ == 'will': return False # will have etc lemma = verb.lemma_.lower() if verb.orth_.lower() in [lemma, lemma + 's', lemma + 'es', 'have', 'has', 'do', 'is', 'are']: return True return False @staticmethod def _is_present_continuous(verb: spacy.tokens.Token) -> bool: for child in verb.children: if child.dep_ == 'aux' and child.lemma_ not in ['be', 'is', 'are', 'am']: return False # will have etc return verb.orth_.endswith('ing') @staticmethod def _get_tree(root: spacy.tokens.Token, depth: int, token_filter: types.FunctionType) -> [spacy.tokens.Token]: """Get list of tokens dependent on given root and satisfying given token_filter""" if depth == 0: return [root] if token_filter(root) else [] result = [] # for tokens on the left of the root, whose head is root for child in filter(token_filter, root.lefts): result += SpacyEventExtractor._get_tree(child, depth - 1, token_filter) result.append(root) # for tokens on the right of the root, whose head is root for child in filter(token_filter, root.rights): result += SpacyEventExtractor._get_tree(child, depth - 1, token_filter) return result @staticmethod def _get_chunk(token: spacy.tokens.Token) -> str: """Get string representation of a chunk. Chunk is one or more tokens that forms semantic unit. For example, compound tokens or tokens with dependent tokens.""" if token is None: return "" def token_filter(tok): """True for various modifiers of tok and compound tokens, which include tok""" return tok is token or \ tok.dep_.endswith("mod") or \ tok.dep_ == "compound" tree = SpacyEventExtractor._get_tree(root=token, depth=2, token_filter=token_filter) return " ".join(map(str, tree)) @staticmethod def _get_prep_with_word(token: spacy.tokens.Token) -> (str, spacy.tokens.Token): """Get prepositional modifiers of the token and important perposition's child""" if token is None: return "", None prep = None # search of prepositions for child in token.rights: if child.dep_ == "prep": prep = child break if prep is None: return "", None for word in prep.children: # if preposition has child of type 'object of preposition' or 'complement of a preposition' # then add it to the result if word.dep_ in ["pobj", "pcomp"]: chunk_str = SpacyEventExtractor._get_chunk(word) return str(prep) + " " + chunk_str, word return "", None @staticmethod def _get_full_entity(entity: spacy.tokens.Token) -> str: """Get entity token with all related tokens (i.e. prepositional modifiers) so, we are extracting such token tree with entity entity mod & compound mod & compound prep pobj | pcomp mod & compound mod & compound (repeat) prep pobj | pcomp mod & compound mod & compound (repeat) ... 
""" entity_string = SpacyEventExtractor._get_chunk(entity) word = entity while True: prep, word = SpacyEventExtractor._get_prep_with_word(word) if word is None: break entity_string += " " + prep return entity_string @staticmethod def _replace_we(replace_we, string): """Replace pronoun 'we' in string with string 'replace_we'""" new_string = "" for word in string.split(): if word == "we" and replace_we is not None: new_string += replace_we + " " elif word == "We" and replace_we is not None: new_string += replace_we.capitalize() + " " else: new_string += str(word) + " " return new_string @staticmethod def _remove_extra_whitespaces(text): return ' '.join(text.strip().split()) @staticmethod def _get_entity1(span): """Get nominal subject of the span's root, if there is one""" for word in span: if word.head is word: # main verb for child in word.children: if child.dep_.endswith("nsubj"): return child break return None @staticmethod def _get_action(verb): """Get auxiliary verbs of the given verb and the verb itself""" aux_verbs = "" for child in verb.children: if child.dep_ == "aux" or child.dep_ == "neg": aux_verbs += str(child) return SpacyEventExtractor._remove_extra_whitespaces(str(aux_verbs) + ' ' + str(verb)) @staticmethod def _get_entity2(verb): """Get direct object of the given verb, if there is one""" for child in verb.children: if child.dep_ == "dobj": return child return None @staticmethod def extract(text: str, replace_we: str = None) -> [Event]: # just because sometimes spaCy fails on sth like we've for aux, replace_with in [('ve', 'have'), ('re', 'are')]: text = text.replace("'" + aux, " " + replace_with).replace("’" + aux, " " + replace_with) # replacing known_phrases for abbr, full in SpacyEventExtractor._known_phrases: reg = re.compile(abbr, re.IGNORECASE) text = reg.sub(full, text) if len(text) == 0: return [] text_doc = SpacyEventExtractor._nlp(text) events = [] keywords_set = set(SpacyEventExtractor._keywords) for doc in text_doc.sents: # if there is no at least one keyword - we ignore that sentence if len(set([word.string.strip().lower() for word in doc]) & keywords_set) == 0: continue entity1 = SpacyEventExtractor._get_entity1(doc) if not entity1: continue verb = entity1.head entity2 = SpacyEventExtractor._get_entity2(verb) if SpacyEventExtractor._is_present_simple(verb) or \ SpacyEventExtractor._is_present_continuous(verb): continue entity1_string = SpacyEventExtractor._get_full_entity(entity1) entity2_string = SpacyEventExtractor._get_full_entity(entity2) entity1_string = SpacyEventExtractor._replace_we(replace_we, entity1_string) entity2_string = SpacyEventExtractor._replace_we(replace_we, entity2_string) entity1_string = SpacyEventExtractor._remove_extra_whitespaces(entity1_string) entity2_string = SpacyEventExtractor._remove_extra_whitespaces(entity2_string) # if there is no keywords in token and subj_string if len(set([word.strip().lower() for word in entity1_string.split()]) & keywords_set) + \ len(set(word.strip().lower() for word in entity2_string.split()) & keywords_set) == 0: continue if SpacyEventExtractor._have_pronouns(entity1_string) or \ SpacyEventExtractor._have_pronouns(entity2_string): continue # entity2 can be empty only in some special cases like: IDEA 2.0 released if verb.lemma_.lower() not in SpacyEventExtractor._important_actions and entity2_string == "": continue action_string = SpacyEventExtractor._get_action(verb) event = Event(entity1_string, entity2_string, action_string, str(doc)) events.append(event) print(event) return events
# coding: utf-8
from __future__ import unicode_literals

import pytest

from spacy.en import English
from spacy.en import attrs

EN = English()


def test_attr_of_token():
    text = u'An example sentence.'
    tokens = EN(text)
    example = EN.vocab[u'example']
    assert example.orth != example.shape
    feats_array = tokens.to_array((attrs.ORTH, attrs.SHAPE))
    assert feats_array[0][0] != feats_array[0][1]


def test_tag():
    text = u'A nice sentence.'
    tokens = EN(text)
    assert tokens[0].tag != tokens[1].tag != tokens[2].tag != tokens[3].tag
    feats_array = tokens.to_array((attrs.ORTH, attrs.TAG))
    assert feats_array[0][1] == tokens[0].tag
    assert feats_array[1][1] == tokens[1].tag
    assert feats_array[2][1] == tokens[2].tag
    assert feats_array[3][1] == tokens[3].tag
import numpy as np
from tqdm import tqdm
from spacy.en import English
from spacy.attrs import LOWER, LIKE_EMAIL, LIKE_URL


def tokenize(texts, max_length, skip=-2, attr=LOWER, merge=False, nlp=None,
             **kwargs):
    """ Uses spaCy to quickly tokenize text and return an array
    of indices.

    This method stores a global NLP directory in memory, and takes
    up to a minute to run for the first time. Later calls will have the
    tokenizer in memory.

    Parameters
    ----------
    texts : list of unicode strings
        These are the input documents. There can be multiple sentences per
        item in the list.
    max_length : int
        This is the maximum number of words per document. If the document is
        shorter than this number it will be padded to this length.
    skip : int, optional
        Short documents will be padded with this variable up until max_length.
    attr : int, from spacy.attrs
        What to transform the token to. Choice must be in spacy.attrs, and
        common choices are (LOWER, LEMMA).
    merge : bool, optional
        Merge noun phrases into a single token. Useful for turning 'New York'
        into a single token.
    nlp : object, optional
        A spaCy NLP object. Useful for not reinstantiating the object multiple
        times.
    kwargs : dict, optional
        Any further argument will be sent to the spaCy tokenizer. For extra
        speed consider setting tag=False, parse=False, entity=False, or
        n_threads=8.

    Returns
    -------
    arr : 2D array of ints
        Has shape (len(texts), max_length). Each value represents
        the word index.
    vocab : dict
        Keys are the word index, and values are the string. The pad index
        gets mapped to '<SKIP>'.

    >>> sents = [u"Do you recall a class action lawsuit", u"hello zombo.com"]
    >>> arr, vocab = tokenize(sents, 10, merge=True)
    >>> arr.shape[0]
    2
    >>> arr.shape[1]
    10
    >>> w2i = {w: i for i, w in vocab.iteritems()}
    >>> arr[0, 0] == w2i[u'do']  # First word and its index should match
    True
    >>> arr[0, 1] == w2i[u'you']
    True
    >>> arr[0, -1]  # last word in 0th document is a pad word
    -2
    >>> arr[0, 4] == w2i[u'class action lawsuit']  # noun phrase is tokenized
    True
    >>> arr[1, 1]  # The URL token is thrown out
    -2
    """
    if nlp is None:
        nlp = English()
    data = np.zeros((len(texts), max_length), dtype='int32')
    data[:] = skip
    bad_deps = ('amod', 'compound')
    for row, doc in enumerate(tqdm(list(nlp.pipe(texts, **kwargs)),
                                   desc="tokenizing")):
        if merge:
            # from the spaCy blog, an example on how to merge
            # noun phrases into single tokens
            for phrase in doc.noun_chunks:
                # Only keep adjectives and nouns, e.g. "good ideas"
                while len(phrase) > 1 and phrase[0].dep_ not in bad_deps:
                    phrase = phrase[1:]
                if len(phrase) > 1:
                    # Merge the tokens, e.g. good_ideas
                    phrase.merge(phrase.root.tag_, phrase.text,
                                 phrase.root.ent_type_)
            # Iterate over named entities
            for ent in doc.ents:
                if len(ent) > 1:
                    # Merge them into single tokens
                    ent.merge(ent.root.tag_, ent.text, ent.label_)
        dat = doc.to_array([attr, LIKE_EMAIL, LIKE_URL]).astype('int32')
        if len(dat) > 0:
            dat = dat.astype('int32')
            msg = "Negative indices reserved for special tokens"
            assert dat.min() >= 0, msg
            # Replace email and URL tokens
            idx = (dat[:, 1] > 0) | (dat[:, 2] > 0)
            dat[idx] = skip
            length = min(len(dat), max_length)
            data[row, :length] = dat[:length, 0].ravel()
    uniques = np.unique(data)
    vocab = {v: nlp.vocab[v].lower_ for v in uniques if v != skip}
    vocab[skip] = '<SKIP>'
    return data, vocab
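# Usage sketch (not in the original source; the document strings below are
# illustrative). Reusing one English() instance across calls avoids reloading
# the model, and extra kwargs such as n_threads are passed straight to nlp.pipe.
if __name__ == '__main__':
    nlp = English()
    docs = [u"The first document.", u"A second, slightly longer document."]
    arr, vocab = tokenize(docs, max_length=20, merge=True, nlp=nlp, n_threads=2)
    print(arr.shape)  # (2, 20); unused positions hold the skip value, -2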
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 28 09:26:02 2017

@author: arlittr
"""

from spacy.en import English
parser = English()

import pandas as pd
from nltk.corpus import stopwords as stopwords
import networkx as nx
import string
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
from math import inf
import hdbscan
from datetime import datetime


def cleanPassage(rawtext):
    # some code from https://nicschrading.com/project/Intro-to-NLP-with-spaCy/

    # if data is bad, return empty
    if type(rawtext) is not str:
        return ''

    # split text with punctuation
from spacy.en import English, LOCAL_DATA_DIR
import spacy.en
import os, time

data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
nlp = English(parser=False, tagger=True, entity=False)


def print_fine_pos(token):
    return token.tag_


def pos_tags(sentence):
    # sentence = str(sentence, "utf-8")
    # sentence = sentence.decode("utf-8")
    tokens = nlp(sentence)
    tags = []
    for tok in tokens:
        tags.append((tok, print_fine_pos(tok)))
    words = []
    for (pos, tag) in tags:
        words.append(pos.text)
    print(words)
    return tags


start = time.time()
a = "The dosa was brilliant and so was the samosa"
print(pos_tags(a))
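# Sketch (not in the original script): `start` is recorded above but never
# reported; this prints the elapsed wall-clock time for the tagging run.
print("tagged in %.3f seconds" % (time.time() - start))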
import re

from spacy.en import English

# NOTE: mapOfWords is assumed to be loaded elsewhere (a table with an
# 'Original' column of phrases and a 'Map' column of their replacements).


# Normalize text - This function requires modification and improvements
def clean_text(text):
    for i in range(0, len(mapOfWords)):
        if len(mapOfWords['Original'].iloc[i].split()) <= 1:
            text = re.sub(r'\b' + mapOfWords['Original'].iloc[i] + '([.,\s]|$)',
                          ' ' + mapOfWords['Map'].iloc[i] + ' ',
                          text, flags=re.IGNORECASE)
        else:
            text = re.sub(r'' + mapOfWords['Original'].iloc[i] + '([.,\s]|$)',
                          ' ' + mapOfWords['Map'].iloc[i] + ' ',
                          text, flags=re.IGNORECASE)
    text = text.strip()  # Trim string
    return text


# Some extra steps for normalization, removing unnecessary spaces, and fixes
# for observed issues
nlp = English()  # required both for sentence tokenization and lemmatization
nlp.vocab.morphology.lemmatizer.exc[u'verb'][u'need'] = ('need',)
nlp.vocab.morphology.lemmatizer.exc[u'noun'][u'tier'] = ('tier',)
nlp.vocab.morphology.lemmatizer.exc[u'adj'][u'tier'] = ('tier',)


def calling_clean_text(text):
    # raw string so \b is a word boundary, not a backspace character
    text = re.sub(r'\bcant\b', 'cannot', text, flags=re.IGNORECASE)
    text = re.sub('can\'t', 'cannot', text, flags=re.IGNORECASE)
    text = re.sub('i\'m', 'I am', text, flags=re.IGNORECASE)
    text = re.sub('won\'t', 'will not', text, flags=re.IGNORECASE)
    text = re.sub('n\'t', ' not', text, flags=re.IGNORECASE)
    text = re.sub('\'s', 's', text, flags=re.IGNORECASE)
    text = re.sub('\'ve', ' have', text, flags=re.IGNORECASE)
    text = re.sub('%', ' percent ', text)
    text = re.sub(r'[^\w\s](?<![\-.,%\'])', ' ', text)
    # text = re.sub(r'\w\s(?<![\-.,])',' ',text)
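# Sketch (not part of the original snippet): a small helper to inspect the
# effect of the lemmatizer exceptions registered above, which pin the lemmas
# of "need" (verb) and "tier" (noun/adjective) to themselves. The example
# sentence is illustrative.
def show_lemmas(text):
    return [(tok.text, tok.lemma_) for tok in nlp(text)]

# e.g. show_lemmas(u'We need another tier')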
""" Function that takes a text and returns an xml object containing the NAF. """ doc = nlp(text) time = current_time() return naf_from_doc(doc, time=time) def NAF_to_string(NAF, byte=False): """ Function that takes an XML object containing NAF, and returns it as a string. If byte is True, then the output is a bytestring. """ xml_string = etree.tostring(NAF, pretty_print=True, with_comments=True) if byte: return xml_string else: return xml_string.decode('utf-8') # Command line functionality: given name of a file, process the file contents and # print the NAF to stdout. if __name__ == '__main__': import sys from spacy.en import English nlp = English() with open(sys.argv[1]) as f: text = f.read() NAF = text_to_NAF(text, nlp) print(NAF_to_string(NAF))