def test_allocate(self):
    # Small data
    context = "Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child."
    context_toks = tokenize(context)
    anchors = align(context, context_toks)
    query = ['Houston', ',', 'Texas']
    start_char = 19
    end_char = 32
    span = Span.allocate(anchors, start_char, end_char)
    self.assertEqual(span.start, 4)
    self.assertEqual(span.end, 6)
    for k in range(span.start, span.end + 1):
        self.assertEqual(context_toks[k], query[k - span.start])

    # Real data
    context = "Beyonc\u00e9 Giselle Knowles-Carter (/bi\u02d0\u02c8j\u0252nse\u026a/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyonc\u00e9's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles \"Crazy in Love\" and \"Baby Boy\"."
    context_toks = tokenize(context)
    anchors = align(context, context_toks)
    query = ['Dangerously', 'in', 'Love']
    start_char = 505
    end_char = 523
    span = Span.allocate(anchors, start_char, end_char)
    self.assertEqual(span.start, 108)
    self.assertEqual(span.end, 110)
    for k in range(span.start, span.end + 1):
        self.assertEqual(context_toks[k], query[k - span.start])
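# The test above pins down the contract of align() and Span.allocate(): anchors holds,
# for each token, its starting character offset in the raw context, and allocate() maps
# a character range onto the covering token range. A minimal sketch consistent with that
# contract (an assumption for illustration, not the project's actual implementation):
import bisect
from collections import namedtuple

SpanSketch = namedtuple('SpanSketch', ['start', 'end'])

def align_sketch(context, context_toks):
    # Walk the context left to right, recording where each token starts.
    anchors, pos = [], 0
    for tok in context_toks:
        pos = context.find(tok, pos)
        anchors.append(pos)
        pos += len(tok)
    return anchors

def allocate_sketch(anchors, start_char, end_char):
    # Covering start token: last token starting at or before start_char;
    # covering end token: last token starting at or before end_char.
    start = bisect.bisect_right(anchors, start_char) - 1
    end = bisect.bisect_right(anchors, end_char) - 1
    return SpanSketch(start, end)

# For the small-data case above: allocate_sketch(align_sketch(context, context_toks), 19, 32)
# yields SpanSketch(start=4, end=6), i.e. the tokens 'Houston', ',', 'Texas'.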
def test_parse_json(self):
    train_json = json.load(open(SQUAD_JSON_FILE, 'rt'))

    para_js = train_json['data'][0]['paragraphs'][0]
    context = para_js['context']
    context_toks = tokenize(context)
    anchors = align(context, context_toks)
    answer_json = para_js['qas'][0]['answers']
    answers = Answer.parse_json(answer_json, context, context_toks, anchors)
    self.assertEqual(answers[0].span.start, 56)
    self.assertEqual(answers[0].span.end, 59)
    self.assertEqual(answers[0].answer_toks, ['in', 'the', 'late', '1990s'])

    para_js = train_json['data'][0]['paragraphs'][3]
    context = para_js['context']
    context_toks = tokenize(context)
    anchors = align(context, context_toks)
    answer_json = para_js['qas'][8]['answers']
    answers = Answer.parse_json(answer_json, context, context_toks, anchors)
    # self.assertEqual(answers[0].span)
    self.assertIsNotNone(answers[0].span)
    print(answers[0].span)
def test_tokenize(self):
    self.assertEqual(
        tokenize("What is the title of his first commercially successful work?"),
        ['What', 'is', 'the', 'title', 'of', 'his', 'first', 'commercially',
         'successful', 'work', '?'])
    self.assertEqual(tokenize("Rondo Op. 1."), ['Rondo', 'Op', '.', '1', '.'])
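# A regex tokenizer that reproduces the expectations in the test above (words kept whole,
# each punctuation mark split off as its own token). This is a sketch of the behaviour the
# test implies, not necessarily how the project's tokenize() is implemented:
import re

def tokenize_sketch(text):
    # \w+ grabs alphanumeric runs, [^\w\s] grabs each punctuation character separately.
    return re.findall(r"\w+|[^\w\s]", text)

# tokenize_sketch("Rondo Op. 1.") == ['Rondo', 'Op', '.', '1', '.']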
def ask(self, context, query_text):
    vocab = self.data.vocab

    # Parse context
    raw_context = context
    context_toks = tokenize_long_text(context)
    context_toks = [t.strip(' ') for t in context_toks]
    context_chars = to_chars(context_toks, cf.WORD_LEN, cf.PAD_CHAR)
    contextw = vocab.vectorize(context_toks, cf.CONTEXT_LEN)
    contextc = vocab.vectorize_c(context_chars, cf.CONTEXT_LEN, cf.WORD_LEN)

    # Parse query
    q_toks = tokenize(query_text)
    queryw = vocab.vectorize(q_toks, cf.QUERY_LEN)
    question_chars = to_chars(q_toks, cf.WORD_LEN, cf.PAD_CHAR)
    queryc = vocab.vectorize_c(question_chars, cf.QUERY_LEN, cf.WORD_LEN)

    # Build input
    X_batch = [[np.array(contextw)], [np.array(queryw)],
               [np.array(contextc)], [np.array(queryc)]]

    # Predict
    p1, p2, starts, ends = self.keras_model.predict_on_batch(X_batch)
    start = int(np.squeeze(starts, axis=-1)[0])
    end = int(np.squeeze(ends, axis=-1)[0])
    answer = [context_toks[i] for i in range(start, end + 1)]
    return answer
def main():
    translation_table = get_word_translations("100kword_trans.csv")
    translator = DirectTrans(translation_table)
    english = tokenize("data/100ktok.low.en")
    spanish = tokenize("data/100ktok.low.es")
    training_set, test_set, translated_set = get_datasets(english, spanish)

    test_output = open('trans_direct.txt', 'w')
    for i in range(len(test_set)):
        test_output.write(' '.join(translator.translate(test_set[i])) + "\n")
    test_output.close()
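# DirectTrans and get_word_translations are defined elsewhere in the project. A minimal
# sketch of the word-for-word strategy the script suggests (an assumed illustration, not
# the project's actual class): look each source word up in the translation table and keep
# it unchanged when no translation is known.
class DirectTransSketch:
    def __init__(self, translation_table):
        # translation_table: dict mapping a source word to its best translation.
        self.table = translation_table

    def translate(self, sentence):
        # sentence: list of source-language tokens; returns target-language tokens.
        return [self.table.get(word, word) for word in sentence]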
def main():
    english = tokenize("data/100ktok.low.en")
    spanish = tokenize("data/100ktok.low.es")
    training_set, test_set, translated_set = get_datasets(english, spanish)
    translations = get_word_translations("3000_trans.txt")
    search = BeamSearch(training_set, translations)

    test_output = open('trans_beam.txt', 'w')
    true_output = open('trans_true.txt', 'w')
    for i in range(len(test_set)):
        print("Translating sentence", i, "...")
        test_output.write(' '.join(search.translate(test_set[i])) + "\n")
        true_output.write(' '.join(translated_set[i]) + "\n")
    test_output.close()
    true_output.close()
def get_document_abstract_norm(uri):
    doc = get_current(uri)
    bow = utilities.tokenize(doc['abstract'], max_sent=5)
    doc['abstract_norm'] = ' '.join(bow)
    doc['abstract_token'] = list(set([t for t in bow if len(t) > 5]))[:15]
    return doc
def main():
    english = tokenize("data/100ktok.low.en")
    spanish = tokenize("data/100ktok.low.es")
    training_set, test_set, translated_set = get_datasets(english, spanish)
    translations = get_word_translations("3000_trans.txt")

    print("Original Sentence:", ' '.join(test_set[0]))

    translator = DirectTrans(translations)
    print("Direct Translation:", ' '.join(translator.translate(test_set[0])))

    test_output = open('trans_beam.txt', 'w')
    true_output = open('trans_true.txt', 'w')
    search = BeamSearch(training_set, translations)
    print("Beam Translation:", ' '.join(search.translate(test_set[0])))
    print("True Translation:", ' '.join(translated_set[0]))
def create(self, stopword_list=[], stemming_func=None):
    """
    Input Parameters:
    * stopword_list: a list of words to be removed from the collection,
      e.g. ["the", "a", "an", "of", ...]
    * stemming_func: a function for stemming.

    Output:
    * the following two files on disk.

    (1) docs.map:
        This file contains one line per document, mapping document names to
        their internal ids. Every line contains two values separated by a
        single comma: the document name and its internal id.

        Example:
        -------
        simple000001.txt,1
        simple001234.txt,2
        simple987654.txt,3
        ...
        ...

    (2) iindex.map:
        This file contains one line for every token found in the collection,
        listing its occurrences in the collection's files. The lines are
        posting lists that look like this:

        wordA,doc1:freq1,doc2:freq2,doc3:freq3...
        wordB,doc3:freq3,doc5:freq5,doc6:freq6...
    """
    # The original body left these steps as placeholders; the logic below fills
    # them in following the file formats described above. It assumes that
    # tokenize(text, stopword_list, stemming_func) already applies stop-word
    # removal and stemming, as the call signature suggests.
    doc_ids = {}    # document name -> internal id
    iindex = {}     # token -> {doc_id: frequency}
    for doc_id, f in enumerate(self.files, start=1):
        doc_ids[f] = doc_id
        text = open_file(f)
        tokens = tokenize(text, stopword_list, stemming_func)
        for tok in tokens:
            postings = iindex.setdefault(tok, {})
            postings[doc_id] = postings.get(doc_id, 0) + 1

    # Save the document mappings to disk:
    with open("docs.map", "w") as fout:
        for name, doc_id in doc_ids.items():
            fout.write("%s,%d\n" % (name, doc_id))

    # Save the inverted index to disk (postings are keyed by the internal ids
    # written to docs.map, which is assumed to be what "doc1", "doc2" refer to):
    with open("iindex.map", "w") as fout:
        for tok, postings in iindex.items():
            entries = ["%d:%d" % (d, freq) for d, freq in sorted(postings.items())]
            fout.write("%s,%s\n" % (tok, ",".join(entries)))
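# Hypothetical usage of create(); the class name and constructor below are
# illustrative only, assuming the enclosing class is built from a list of file paths:
#
#     index = Collection(files=glob.glob("corpus/*.txt"))
#     index.create(stopword_list=["the", "a", "an", "of"],
#                  stemming_func=lambda w: w.rstrip("s"))
#
# which writes docs.map and iindex.map into the working directory.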
def home(request):
    data = {'title': "pdf converter"}
    if request.method == 'GET':
        return render(request, 'pdftotext/home.html', data)
    elif request.method == 'POST':
        data['pdf_to_text'] = ""
        data['is_converted'] = False
        if request.FILES["pdf-file"]:
            pdf = request.FILES["pdf-file"]
            keywords = request.POST.get("key-words")
            pdfText = PdfText(pdf=pdf)
            pdfText.save()
            pdfPath = pdfText.pdf.path
            text = pdf_to_text(pdfPath)
            data['pdf_to_text'] = text
            data['is_converted'] = True

            global IS_CASE_SENSITIVE
            if not IS_CASE_SENSITIVE:
                text = text.lower()
                keywords = keywords.lower()

            histogramDict = histogram(tokenize(text))
            keywords = tokenize(keywords)
            keywordCount = dict()
            for keyword in keywords:
                count = get_count(keyword, histogramDict)
                keywordCount[keyword] = count
            data['keywords'] = keywordCount

            # # start
            # print "------------------------------------"
            # hist = histogram(tokenize(histogram))
            # for key, value in histogramDict.items():
            #     print str(key) + ":" + str(value)
            # # end

        return render(request, 'pdftotext/home.html', data)
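# The view above relies on histogram() and get_count(), which are not shown here.
# A minimal sketch of what they might look like (an assumption, not the project's
# actual helpers): a token-frequency map and a safe lookup into it.
from collections import Counter

def histogram_sketch(tokens):
    # Map each token to its number of occurrences in the document.
    return Counter(tokens)

def get_count_sketch(keyword, histogram_dict):
    # 0 when the keyword never occurs in the document.
    return histogram_dict.get(keyword, 0)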
def parse_json(cls, answers_js: List[dict], context: str,
               context_toks: List[str], anchors: List[int]):
    answers = []
    for ans in answers_js:
        ans_text = ans['text']
        ans_start = ans['answer_start']
        ans_toks = tokenize(ans_text)
        # Identify the span from context, ans_text & start index
        span = Span.allocate(anchors, ans_start, ans_start + len(ans_text) - 1)
        answers.append(Answer(ans_text, ans_toks, span, ans_start))
    return answers
def tokenize_and_vectorize(sentence, vector_dictionary):
    """Return tokens and vector for sentence"""
    tokens = tokenize(sentence)
    pos = tag(tokens)
    length = len(tokens)
    tokens = remove_stop_words(tokens)
    vector = sentence_vector(tokens, vector_dictionary)
    if vector is DO_NOT_INCLUDE:
        return vector
    return {
        'sentence': sentence,
        'tokens': tokens,
        'sentence_vec': vector,
        'pos': pos,
        'length': length
    }
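# sentence_vector() and DO_NOT_INCLUDE come from elsewhere in the project. A common way
# to implement them is to average the word vectors of the tokens present in the vector
# dictionary and to return a sentinel when none are known; this is an assumed sketch,
# not the project's actual code:
import numpy as np

DO_NOT_INCLUDE_SKETCH = None

def sentence_vector_sketch(tokens, vector_dictionary):
    vectors = [vector_dictionary[t] for t in tokens if t in vector_dictionary]
    if not vectors:
        # No token has a known vector, so the sentence cannot be represented.
        return DO_NOT_INCLUDE_SKETCH
    return np.mean(vectors, axis=0)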
def parse_json(cls, para_js: dict, para_idx: int):
    # Accumulate all answers' tokens first
    all_para_answers = []
    for q in para_js['qas']:
        if 'answers' in q:
            all_para_answers.extend([ans for ans in q['answers']])
        if 'plausible_answers' in q:
            all_para_answers.extend([ans for ans in q['plausible_answers']])

    # Improve the context for better tokenization
    raw_context = para_js['context']
    # context = augment_long_text(para_js['context'], all_para_answers)
    context = raw_context
    context_toks = tokenize_long_text(context)
    context_toks = [t.strip(' ') for t in context_toks]
    anchors = align(raw_context, context_toks)

    questions = []
    for q in para_js['qas']:
        question_text = q['question']
        q_toks = tokenize(question_text)
        ques_id = q['id']
        answers = Answer.parse_json(q['answers'], raw_context, context_toks,
                                    anchors) if 'answers' in q else []
        plausible_answers = Answer.parse_json(
            q['plausible_answers'], raw_context, context_toks,
            anchors) if 'plausible_answers' in q else []
        questions.append(
            Question(question_text, ques_id, q_toks, answers,
                     plausible_answers))

    para = Paragraph(raw_context, context, context_toks, questions, para_idx,
                     anchors)
    for ques in questions:
        ques.set_paragraph(para)
    return para
def transform(record, uri):
    '''
    Extract the relevant data and return a Solr document dict.
    '''
    document = {}

    # The main DBpedia URI as document id
    document['id'] = uri

    # Other language, Wikidata uris
    if uri.startswith('http://nl.'):
        document['uri_nl'] = uri
    else:
        document['uri_en'] = uri

    if PROP_SAME_AS in record:
        for u in record[PROP_SAME_AS]:
            if uri.startswith('http://nl.'):
                if u.startswith('http://dbpedia.org/resource/'):
                    document['uri_en'] = u
            if u.startswith('http://www.wikidata.org/entity/'):
                document['uri_wd'] = u

    # The first (i.e. Dutch if available) label
    document['label'] = record[PROP_LABEL][0]

    # Normalized pref label, based on the label without specification
    # between brackets
    pref_label = utilities.normalize(remove_spec(document['label']))
    document['pref_label'] = pref_label
    document['pref_label_str'] = pref_label

    # The first (i.e. Dutch if available) abstract
    try:
        document['abstract'] = record[PROP_ABSTRACT][0]
    except Exception as e:
        document['abstract'] = '.'

    bow = utilities.tokenize(document['abstract'], max_sent=5)
    document['abstract_norm'] = ' '.join(bow)
    document['abstract_token'] = list(set([t for t in bow if len(t) > 5]))[:15]

    # Language of the (primary) resource description
    document['lang'] = 'nl' if uri.startswith('http://nl.') else 'en'

    # Number of links and inlinks (max of Dutch and English counts)
    if PROP_LINK in record:
        document['outlinks'] = len(record[PROP_LINK])
    document['inlinks'] = max(record['inlinks'])

    # Number of times label appears in newspaper index
    document['inlinks_newspapers'] = ddd_jsru(pref_label)

    # Set ambiguity flag if specification between brackets present in URI and
    # save the specification
    if '_(' in uri and uri.endswith(')'):
        document['ambig'] = 1
        document['spec'] = utilities.normalize(uri_to_string(uri, True))
    else:
        document['ambig'] = 0

    # Normalized alt labels extracted from various name fields as well as
    # redirects
    cand = record[PROP_LABEL][1:]
    cand += record[PROP_NAME]

    if PROP_REDIRECT in record:
        # Exclude English redirects if there are too many
        if len([
                u for u in record[PROP_REDIRECT]
                if u.startswith('http://dbpedia.org/resource/')
        ]) > 100:
            cand += [
                uri_to_string(u) for u in record[PROP_REDIRECT]
                if u.startswith('http://nl.dbpedia.org/resource/')
            ]
        else:
            cand += [uri_to_string(u) for u in record[PROP_REDIRECT]]

    # Include disambiguations for acronyms
    if PROP_DISAMBIGUATES in record:
        for u in record[PROP_DISAMBIGUATES]:
            s = uri_to_string(u)
            if len(s) >= 2 and len(s) <= 5 and s.isupper():
                cand.append(s)

    # Include Wikidata aliases
    if document.get('uri_wd'):
        wd_cand = get_wd_aliases(document.get('uri_wd'))
        cand += wd_cand
        wd_alt_label = clean_labels(wd_cand, pref_label)
        document['wd_alt_label'] = wd_alt_label
        document['wd_alt_label_str'] = wd_alt_label

    alt_label = clean_labels(cand, pref_label)
    document['alt_label'] = alt_label
    document['alt_label_str'] = alt_label

    # Keywords extracted from Dutch DBpedia category links, e.g.
    # http://nl.dbpedia.org/resource/Categorie:Amerikaans_hoogleraar
    # should return ['amerikaans', 'hoogleraar']
    if PROP_LINK in record:
        keywords = []
        for link in record[PROP_LINK]:
            if link.startswith('http://nl.dbpedia.org/resource/Categorie:'):
                s = uri_to_string(link).split('Categorie:')[1]
                # Crude stop word filtering. Use list instead?
                keywords += [
                    k for k in utilities.normalize(s).split() if len(k) >= 5
                ]
        keywords = list(set(keywords))
        for k in pref_label.split():
            if k in keywords:
                keywords.remove(k)
        document['keyword'] = keywords

    # DBpedia ontology and schema.org types
    if PROP_TYPE in record:
        document['dbo_type'] = list(
            set([
                t.split('/')[-1] for t in record[PROP_TYPE]
                if t.startswith('http://dbpedia.org/ontology/')
                and t.find('Wikidata:') < 0 and t.find('%') < 0
            ]))
        document['schema_type'] = list(
            set([
                t.split('/')[-1] for t in record[PROP_TYPE]
                if t.startswith('http://schema.org/')
            ]))

    # Predicted topics and types
    resp = requests.get(TOPICS_URL, params={'url': uri}, timeout=300)
    if resp.status_code != 200:
        raise Exception('Error retrieving topics')
    resp = resp.json()
    for t in resp['topics']:
        document['topic_{}'.format(t)] = resp['topics'][t]
    for t in resp['types']:
        document['dbo_type_{}'.format(t)] = resp['types'][t]

    # Probable last name, for persons only
    if (('dbo_type' in document and 'Person' in document['dbo_type'])
            or ('dbo_type' not in document
                and document['dbo_type_person'] >= 0.75)):
        last_part = utilities.get_last_part(pref_label,
                                            exclude_first_part=True)
        if last_part:
            document['last_part'] = last_part
            document['last_part_str'] = last_part

    # Birth and death dates, taking the minimum of multiple birth date options
    # and the maximum of multiple death dates
    # E.g. -013-10-07+01:00
    if PROP_BIRTH_DATE in record:
        cand = []
        for date in record[PROP_BIRTH_DATE]:
            try:
                cand.append(int(date[:4]))
            except Exception as e:
                continue
        if cand:
            document['birth_year'] = min(cand)

    if PROP_DEATH_DATE in record:
        cand = []
        for date in record[PROP_DEATH_DATE]:
            try:
                cand.append(int(date[:4]))
            except Exception as e:
                continue
        if cand:
            document['death_year'] = max(cand)

    # Birth and death places, giving preference to Dutch options
    nl_resource = 'http://nl.dbpedia.org/resource/'
    en_resource = 'http://dbpedia.org/resource/'

    if PROP_BIRTH_PLACE in record:
        places = [
            utilities.normalize(uri_to_string(p))
            for p in record[PROP_BIRTH_PLACE] if p.startswith(nl_resource)
        ]
        if not places:
            places = [
                utilities.normalize(uri_to_string(p))
                for p in record[PROP_BIRTH_PLACE] if p.startswith(en_resource)
            ]
        document['birth_place'] = list(set(places))

    if PROP_DEATH_PLACE in record:
        places = [
            utilities.normalize(uri_to_string(p))
            for p in record[PROP_DEATH_PLACE] if p.startswith(nl_resource)
        ]
        if not places:
            places = [
                utilities.normalize(uri_to_string(p))
                for p in record[PROP_DEATH_PLACE] if p.startswith(en_resource)
            ]
        document['death_place'] = list(set(places))

    # OCR tolerant labels
    if 'pref_label' in document:
        pref_label_ocr = utilities.normalize_ocr(document['pref_label'])
        document['pref_label_ocr'] = pref_label_ocr
        document['pref_label_str_ocr'] = pref_label_ocr
    if 'alt_label' in document:
        alt_label_ocr = [
            utilities.normalize_ocr(label) for label in document['alt_label']
        ]
        document['alt_label_ocr'] = alt_label_ocr
        document['alt_label_str_ocr'] = alt_label_ocr
    if 'last_part' in document:
        last_part_ocr = utilities.normalize_ocr(document['last_part'])
        document['last_part_ocr'] = last_part_ocr
        document['last_part_str_ocr'] = last_part_ocr

    # Vectors
    # Wikidata
    if 'uri_wd' in document:
        payload = {'source': document['uri_wd'].split('/')[-1]}
        response = requests.get(W2V_URL, params=payload, timeout=300)
        data = response.json()
        if data['vectors']:
            data = [float('{0:.3f}'.format(f)) for f in data['vectors'][0]]
            document['vector'] = json.dumps(data)

    # Abstract and keyword tokens
    tokens = []
    if 'abstract_token' in document:
        tokens.extend(document['abstract_token'])
    if 'keyword' in document:
        tokens.extend(document['keyword'])
    if tokens:
        if 'pref_label' in document:
            tokens = [
                t for t in tokens
                if t not in document['pref_label'].split()
            ]
        tokens = [
            t for t in tokens
            if t not in dictionary.unwanted and len(t) >= 5
        ]
        payload = {'source': ' '.join(list(set(tokens)))}
        response = requests.get(W2V_URL, params=payload, timeout=300)
        data = response.json()['vectors']
        if data:
            document['abstract_vector'] = [
                json.dumps([float('{0:.3f}'.format(f)) for f in v])
                for v in data
            ]

    return document
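# transform() leans on uri_to_string() and remove_spec(), which are not shown here. A
# sketch of the behaviour the calls imply (assumed for illustration, not the project's
# implementation): turn a DBpedia resource URI into a readable label, optionally returning
# only the specification between brackets, and strip that specification from plain labels.
from urllib.parse import unquote

def uri_to_string_sketch(uri, spec_only=False):
    # 'http://nl.dbpedia.org/resource/Amsterdam_(band)' -> 'Amsterdam (band)'
    label = unquote(uri.split('/resource/')[-1]).replace('_', ' ')
    if spec_only:
        # Return only the part between brackets, e.g. 'band'.
        return label[label.find('(') + 1:label.rfind(')')] if '(' in label else ''
    return label

def remove_spec_sketch(label):
    # 'Amsterdam (band)' -> 'Amsterdam'
    return label.split(' (')[0]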
def merge_otherft(sentences_set):
    for sent in sentences_set:
        # Tokenize all sentences
        sent.tokens = tokenize(sent.content)
        # Convert all 'other features' to 'no feature'
        if sent.labeled_aspects == 'other features':
            sent.labeled_aspects = 'no feature'
#! /usr/bin/env python3
import os, sys, time, re

from utilities import tokenize, runProgram, exitShell
from utilities import changeDirectory

"""
The following is based on:
-) https://github.com/robustUTEP/os-demos/blob/master/ch5-api/p3-exec.py
-) https://brennan.io/2015/01/16/write-a-shell-in-c/
"""

# Set the status to an impossibly large number as an initial start
status = sys.maxsize
args = ''

while True:
    args = tokenize(' ', input("$ "))
    if args[0] == 'exit':
        exitShell(args)
    elif args[0] == 'pwd':
        # Print the working directory
        os.write(1, (os.getcwd() + '\n').encode())
        continue
    elif args[0] == 'cd':
        try:
            os.chdir(args[1])
        except:
            os.write(2, ("path not valid: %s\n" % args[1]).encode())
        continue
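    # The snippet ends here; the imported runProgram presumably handles external
    # commands in the original. A minimal fork/exec sketch in the spirit of the
    # referenced demos (an assumption, not the original continuation):
    else:
        pid = os.fork()
        if pid == 0:
            # Child: search PATH for the program and replace this process image.
            try:
                os.execvp(args[0], args)
            except FileNotFoundError:
                os.write(2, ("command not found: %s\n" % args[0]).encode())
                sys.exit(1)
        else:
            # Parent: wait for the child and record its exit status.
            _, status = os.waitpid(pid, 0)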
def prepare_word_list(self):
    """Create word lists from documents"""
    print("pre-processing text...")
    self.texts = [[word.lower() for word in utils.tokenize(document)
                   if word not in utils.SW] for document in self.documents]
    self.word_counts = [float(len(text)) for text in self.texts]