def build_data(fnamex, fnamey, num_layers, max_seq_len): fdx, fdy = open(fnamex), open(fnamey) x_token_list = [] y_token_list = [] # we need to fill in the entire dataset linex, liney = fdx.readline(), fdy.readline() while linex and liney: x_tokens, y_tokens = util.tokenize(linex), util.tokenize(liney) # this is not truncating...just ignoring if len(x_tokens) < max_seq_len and len(y_tokens) < max_seq_len: x_token_list.append(x_tokens) y_token_list.append(y_tokens) linex, liney = fdx.readline(), fdy.readline() y_token_list = add_sos_eos(y_token_list) # shift y by 1 position x_padded, y_padded = padded(x_token_list, num_layers), padded(y_token_list, 1) source_tokens = np.array(x_padded).T source_mask = (source_tokens != PAD_ID).astype(np.int32) target_tokens = np.array(y_padded).T target_mask = (target_tokens != PAD_ID).astype(np.int32) return source_tokens, source_mask, target_tokens, target_mask
def get_similarity(in_phrase, words, embeddings):
    # word embeddings is assumed to be a dict of {word: embedding} pairs of type {string: np array}
    # words is assumed to be a list of lists of form [word, freq], types - [str, int]
    # if it is a string then it is converted to the [[w1,f1],[w2,f2],...] form
    # words = ast.literal_eval(words)
    if isinstance(words, str):
        words = util.word_freq(util.tokenize(words))  # -----> tokenizing takes out NEs by default
    a = np.zeros(len(embeddings['example']))  # initialize vector to same dimension as preloaded embeddings
    for word in util.tokenize(in_phrase):
        if word in embeddings:  # skip out-of-vocabulary words, mirroring the check on `words` below
            a = a + embeddings[word]  # -----> this should also be weighted more by thing than category
    a = a / np.sqrt(np.dot(a, a))  # -----> normalize
    b = np.zeros(len(embeddings['example']))
    for [word, freq] in words:
        if word in embeddings.keys():
            b = b + freq * embeddings[word]  # weights
    b = b / np.sqrt(np.dot(b, b))  # -----> normalize
    similarity = np.dot(a, b)  # normalized weighted inner product by frequency
    return similarity
def get_distance(in_phrase, words, embeddings):
    # word embeddings is assumed to be a dict of {word: embedding} pairs of type {string: np array}
    # words is assumed to be a list of lists of form [word, freq], types - [str, int]
    # if it is a string then it is converted to the [[w1,f1],[w2,f2],...] form
    # words = ast.literal_eval(words)
    if isinstance(words, str):
        words = util.word_freq(util.tokenize(words))  # -----> tokenizing takes out NEs by default
    a = np.zeros(len(embeddings['example']))  # initialize vector to same dimension as preloaded embeddings
    for word in util.tokenize(in_phrase):
        if word in embeddings:  # skip out-of-vocabulary words, mirroring the check on `words` below
            a = a + embeddings[word]
    a = a / np.sqrt(np.dot(a, a))
    b = np.zeros(len(embeddings['example']))
    for [word, freq] in words:
        if word in embeddings.keys():
            b = b + freq * embeddings[word]  # weights
    b = b / np.sqrt(np.dot(b, b))
    distance = np.sqrt(np.sum(np.square(a - b)))  # euclidean distance between weighted vectors
    distance = (2 - distance) / 2  # -------> transform so that more relevant targets get a higher score from this function
    return distance
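# Illustrative sketch (not from the original module): the weighted-centroid math that
# get_similarity / get_distance above compute, run on a tiny hand-made embedding table.
# `toy_embeddings`, the plain str.split() tokenizer and the [[word, freq], ...] list are
# stand-ins for the preloaded embeddings, util.tokenize and util.word_freq.
import numpy as np

toy_embeddings = {
    'example': np.array([1.0, 0.0, 0.0]),
    'cat':     np.array([0.9, 0.1, 0.0]),
    'dog':     np.array([0.8, 0.2, 0.0]),
    'piano':   np.array([0.0, 0.1, 0.9]),
}

def _unit(v):
    return v / np.sqrt(np.dot(v, v))

# phrase vector: sum of word embeddings, L2-normalized
a = _unit(sum(toy_embeddings[w] for w in 'cat dog'.split()))
# page vector: frequency-weighted sum of word embeddings, L2-normalized
b = _unit(sum(freq * toy_embeddings[w] for w, freq in [['dog', 3], ['piano', 1]]))

print(np.dot(a, b))                     # cosine similarity, as in get_similarity
print((2 - np.linalg.norm(a - b)) / 2)  # euclidean distance rescaled so higher = more relevant, as in get_distance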
def get_similarities(): global model check_model() comparison = int(flask.request.args.get("article")) ctoken = util.tokenize([util.fetch(comparison)[1]]) recent_articles = util.fetch_top_100(comparison) recent_article_ids = [a[0] for a in recent_articles] founds, not_founds = util.fetch_with(comparison, recent_article_ids) result_ids = [f[0] for f in founds] similarities = [f[1] for f in founds] not_found_articles = [a for a in recent_articles if a[0] in not_founds] for a in not_found_articles: sim = model.docvecs.similarity_unseen_docs(model, ctoken[0], util.tokenize([a[1]])[0]) util.insert_similarity(comparison, a[0], sim.item()) result_ids.append(a[0]) similarities.append(str(sim)) return flask.jsonify( result="ok", articleIds=result_ids, similarities=similarities )
def main(): parser = argparse.ArgumentParser("Creates a result file indicating the sentence-difference baseline.") parser.add_argument("--vectorspace", "-v", help="Vector space file.") parser.add_argument("--sentences", "-s", help="The appropriate .txt file in resources.") parser.add_argument("--vectormaker", choices=("add", "mul"), default="add", help="Use vector addition or multiplication.") args = parser.parse_args() if args.vectormaker == "add": combiner = operator.add elif args.vectormaker == "mul": combiner = operator.mul else: sys.stderr.write("Don't know what to do with vectormaker '%s'." % args.vectormaker) sys.exit(1) sentence_pairs = [l.strip().lower().split("\t") for l in open(args.sentences)] tokenized_sentences = [(tokenize(left), tokenize(right)) for left, right in sentence_pairs] vectorspace = load_vectorspace(args.vectorspace) sys.stdout.write("Errors found: 0\n") sys.stdout.write("[") for left_sent, right_sent in tokenized_sentences: left_pieces, right_pieces = find_difference(left_sent, right_sent) features = extract_features(left_pieces, right_pieces, vectorspace, combiner) sys.stdout.write(" " + ",".join(map(str, features))) sys.stdout.write("]\n")
def train1(english, french, trans_prob, loop_count=20):
    for i in range(loop_count):
        print("Running Iteration..", i + 1)
        count = collections.defaultdict(float)
        total = collections.defaultdict(float)
        sum_total = {}
        # iterate over aligned sentence pairs (e_sent/f_sent keep the english/french corpus arguments intact)
        for (e_sent, f_sent) in zip(english, french):
            e_sent = tokenize(e_sent)
            f_sent = tokenize(f_sent)
            # E-step: collect expected counts
            for e in e_sent:
                sum_total[e] = 0.0
                for f in f_sent:
                    sum_total[e] += trans_prob[(e, f)]
            for e in e_sent:
                for f in f_sent:
                    count[(e, f)] += trans_prob[(e, f)] / sum_total[e]
                    total[f] += trans_prob[(e, f)] / sum_total[e]
        # M-step: re-estimate the translation probabilities
        for (e, f) in count.keys():
            trans_prob[(e, f)] = count[(e, f)] / total[f]
        pickle.dump(trans_prob,
                    open('../OutputFiles/map1_' + str(size) + '_' + str(i + 1) + '.pickle', 'wb'))
    return trans_prob
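# Minimal self-contained sketch (not part of the original script): the same IBM Model 1 EM
# update that train1 performs, run on a two-pair toy corpus. Whitespace splitting and the
# uniform 0.5 initialization are stand-ins for tokenize() and the real translation table.
import collections

corpus = [("the house", "la maison"), ("the", "la")]
pairs = [(e.split(), f.split()) for e, f in corpus]

trans_prob = collections.defaultdict(lambda: 0.5)  # any uniform positive start works here
for _ in range(5):
    count = collections.defaultdict(float)
    total = collections.defaultdict(float)
    for e_sent, f_sent in pairs:
        # E-step: normalize each English word's probability mass over the French words
        sum_total = {e: sum(trans_prob[(e, f)] for f in f_sent) for e in e_sent}
        for e in e_sent:
            for f in f_sent:
                delta = trans_prob[(e, f)] / sum_total[e]
                count[(e, f)] += delta
                total[f] += delta
    # M-step: re-estimate t(e|f) from the expected counts
    for (e, f) in count:
        trans_prob[(e, f)] = count[(e, f)] / total[f]

# t('house'|'maison') and t('the'|'la') move toward 1.0 after a few iterations
print(round(trans_prob[('house', 'maison')], 3), round(trans_prob[('the', 'la')], 3))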
def tokenize_qa(qa, context): tokenized_qa = list() a = qa["answers"][0] question_answer = dict() question_answer["question"] = tokenize(qa["question"]) question_answer["answer_begin"], question_answer["answer_end"] = \ find_subtext(context, tokenize(a["text"])) return question_answer
def initialize(options): if not OPENCL: options.no_ocl = True return [] options.worksize = tokenize(options.worksize, 'worksize') options.frames = tokenize(options.frames, 'frames', [30]) options.frameSleep = tokenize(options.frameSleep, 'frameSleep', cast=float) options.vectors = [True] if options.old_vectors else tokenize( options.vectors, 'vectors', [False], bool) platforms = cl.get_platforms() if options.platform >= len(platforms) or (options.platform == -1 and len(platforms) > 1): print( 'Wrong platform or more than one OpenCL platforms found, use --platform to select one of the following\n' ) for i in range(len(platforms)): print('[%d]\t%s' % (i, platforms[i].name)) sys.exit() if options.platform == -1: options.platform = 0 devices = platforms[options.platform].get_devices() if not options.device and devices: print('\nOpenCL devices:\n') for i in range(len(devices)): print('[%d]\t%s' % (i, devices[i].name)) print('\nNo devices specified, using all GPU devices\n') miners = [ OpenCLMiner(i, options) for i in range(len(devices)) if ((not options.device and devices[i].type == cl.device_type.GPU) or ( i in options.device)) ] for i in range(len(miners)): miners[i].worksize = options.worksize[min(i, len(options.worksize) - 1)] miners[i].frames = options.frames[min(i, len(options.frames) - 1)] miners[i].frameSleep = options.frameSleep[min( i, len(options.frameSleep) - 1)] miners[i].vectors = options.vectors[min(i, len(options.vectors) - 1)] miners[i].cutoff_temp = options.cutoff_temp[min( i, len(options.cutoff_temp) - 1)] miners[i].cutoff_interval = options.cutoff_interval[min( i, len(options.cutoff_interval) - 1)] return miners
def main(): parser = argparse.ArgumentParser("Creates a result file by counting occurrences of key phrases.") parser.add_argument("--sentences", "-s", help="The appropriate .txt file in resources.") args = parser.parse_args() sentence_pairs = [l.strip().lower().split("\t") for l in open(args.sentences)] tokenized_sentences = [(tokenize(left), tokenize(right)) for left, right in sentence_pairs] sys.stdout.write("Errors found: 0\n") sys.stdout.write("[") for left_sent, right_sent in tokenized_sentences: features = extract_features(left_sent, right_sent) sys.stdout.write(" " + ",".join(map(str, features))) sys.stdout.write("]\n")
def compute_inverted_index(coll_folder, stemming, output_file_path_ii): if not os.path.isfile(output_file_path_ii): print('computing inverted index') inverted_idx = {} sw = util.load_indri_stopwords() doc_n = 0 for filename in tqdm(os.listdir(coll_folder)): fp = os.path.join(coll_folder, filename) doc_id = filename.split(r'.')[0] if os.path.isfile(fp): doc_n += 1 d = util.tokenize(' '.join(open(fp, 'r').readlines()), stemming, stoplist=sw) set_w_in_doc = set(d) for w in set_w_in_doc: if w in inverted_idx.keys(): inverted_idx[w].append((doc_id, d.count(w))) else: inverted_idx[w] = [(doc_id, d.count(w))] util.save_model(inverted_idx, output_file_path_ii) else: inverted_idx = util.load_model(output_file_path_ii) return inverted_idx
def encode_collection(text_by_name_p, word_dict_path, encoded_out_folder): # word_dict_path = '/media/alberto/DATA/BaiduNetdiskDownload/data/word_dict.txt' text_by_name = {} print('reading files in folder') for filename in tqdm(os.listdir(text_by_name_p)): fp = os.path.join(text_by_name_p, filename) if os.path.isfile(fp): text_by_name[filename.split(r'.')[0]] = ' '.join( open(fp, 'r').readlines()) print('reading word2vec model') encoded_docs_by_name = {} wi = {} for line in tqdm(open(word_dict_path)): data = line.split() word = data[0].strip() wid = int(data[1].strip()) if word not in wi.keys(): wi[word] = wid sw = load_indri_stopwords() print('encoding data') for dn, dc in tqdm(text_by_name.items()): td = util.tokenize(dc, stemming=False, stoplist=sw) encoded_doc = [wi[w] for w in td if w in wi.keys()] util.save_model(encoded_doc, os.path.join(encoded_out_folder, dn)) encoded_docs_by_name[dn] = encoded_doc return encoded_docs_by_name
def test_tokens_expr(): src, names = expr_fix assert list(util.tokenize(src)) == [ TokenInfo(type=2, string='2', start=(1, 0), end=(1, 1), line='2 + 3'), TokenInfo(type=14, string='+', start=(1, 2), end=(1, 3), line='2 + 3'), TokenInfo(type=2, string='3', start=(1, 4), end=(1, 5), line='2 + 3'), TokenInfo(type=0, string='', start=(2, 0), end=(2, 0), line='')]
def tokenize_data(X): ''' Stores token lists of questions and response in dataframe X''' X['t_q'] = X['Question Text'].apply(tokenize) X['t_r'] = X['Response'].apply(tokenize) X['t_r_f'] = X['Response'].apply(lambda x: tokenize(first_sentence(x))) return X
def get_we_matrix_wi_encode_docs_w_fasttext(ftext_model_path, docs_text_main_folder, encoded_out_folder_docs): f = load_model(ftext_model_path) text_by_name = {} print('reading files in folder') for filename in tqdm(os.listdir(docs_text_main_folder)): fp = os.path.join(docs_text_main_folder, filename) if os.path.isfile(fp): text_by_name[filename.split(r'.')[0]] = ' '.join( open(fp, 'r').readlines()) stoplist = load_indri_stopwords() print('encoding collection') encoded_docs_by_name = {} wi = {} we_matrix = [] for dn, dt in tqdm(text_by_name.items()): tok_doc = util.tokenize(dt, stemming=False, stoplist=stoplist) encoded_doc = [] for tok in tok_doc: if tok not in wi.keys(): wv = f.get_word_vector(tok) wi[tok] = len(wi) we_matrix.append(wv) encoded_doc.append(wi[tok]) util.save_model(encoded_doc, os.path.join(encoded_out_folder_docs, dn)) encoded_docs_by_name[dn] = encoded_doc return encoded_docs_by_name, wi, we_matrix
def extract_emoticon(text): emoticons = [] for token, tag in tag_tweets(tokenize(text)): if tag == "emoticon": emoticons.append(token) # text = text.replace(token, "") return emoticons, text
def make_concepts_baseline(id, path, sents, query): """ only use first sentences TODO: choose best of first 3 """ query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split()) seen_sents = set() all_concepts = collections.defaultdict(set) max_order = 0 for sent in sents: ## store this sentence's concepts sent.concepts = set([]) concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True)) ## get query overlap query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words) ## aggregate all concepts if len(query_overlap) > 0: for concept in concepts: all_concepts[concept].add(sent.doc) if sent.order == 0: for concept in concepts: all_concepts[concept].add(sent.doc + 'first') ## ignore some sents if sent.order == 0: max_order = 0 skip = False if sent.length <= 5: skip = True if sent.tok in seen_sents: skip = True #if sent.length < 20: skip = True if sent.order > max_order or max_order > 0: skip = True max_order = 0 if skip: max_order += 1 continue #print sent.order, max_order, sent.doc, sent seen_sents.add(sent.tok) sent.concepts = concepts ## create final concept set final_concepts = {} for concept, docs in all_concepts.items(): count = len(docs) #if count < 3: continue if util.is_just_stopwords(concept.split('_')): continue final_concepts[concept] = count final_concept_set = set(final_concepts.keys()) for sent in sents: sent.concepts = sent.concepts.intersection(final_concept_set) return create_ilp_output(sents, final_concepts, path+id)
def indexDoc(self, doc):  # indexing a Document object
    """Index a document using the simple SPIMI algorithm; there is no need to store
    blocks because the collection we are handling is small, so the whole index is
    saved/loaded instead."""
    self.nDocs += 1
    tokens = util.tokenize(doc.title + "\n" + doc.body)
    for i, token in enumerate(tokens):
        if token not in self.items:
            self.items[token] = IndexItem(token)
        self.items[token].add(doc.docID, i)
def process_input(tweet): """ Process input data to tokenize and make labels. """ tokens = tokenize(to_ascii(tweet.text)) label = LABELS.index(tweet.topic) assert label >= 0 return tweet.id, tokens, label
def give_loc_tf(text): ls_text = [x.lower() for x in tokenize(text)] freq = FreqDist(ls_text) ls = freq.most_common(len(freq)) loc_tf = {} for x in ls: loc_tf[x[0]] = x[1] return loc_tf
def getAmper(lines: str): out = [] for line in tokenize(lines, add=(",", "+", "-", "*", '=')): start = line.find("$") if start == -1: continue out.append(line) return out
def initialize(options): if not OPENCL: options.no_ocl = True return [] options.worksize = tokenize(options.worksize, 'worksize') options.frames = tokenize(options.frames, 'frames', [30]) options.frameSleep = tokenize(options.frameSleep, 'frameSleep', cast=float) options.vectors = [True] if options.old_vectors else tokenize(options.vectors, 'vectors', [False], bool) platforms = cl.get_platforms() if options.platform >= len(platforms) or (options.platform == -1 and len(platforms) > 1): print 'Wrong platform or more than one OpenCL platforms found, use --platform to select one of the following\n' for i in xrange(len(platforms)): print '[%d]\t%s' % (i, platforms[i].name) sys.exit() if options.platform == -1: options.platform = 0 devices = platforms[options.platform].get_devices() if not options.device and devices: print '\nOpenCL devices:\n' for i in xrange(len(devices)): print '[%d]\t%s' % (i, devices[i].name) print '\nNo devices specified, using all GPU devices\n' miners = [ OpenCLMiner(i, options) for i in xrange(len(devices)) if ( (not options.device and devices[i].type == cl.device_type.GPU) or (i in options.device) ) ] for i in xrange(len(miners)): miners[i].worksize = options.worksize[min(i, len(options.worksize) - 1)] miners[i].frames = options.frames[min(i, len(options.frames) - 1)] miners[i].frameSleep = options.frameSleep[min(i, len(options.frameSleep) - 1)] miners[i].vectors = options.vectors[min(i, len(options.vectors) - 1)] miners[i].cutoff_temp = options.cutoff_temp[min(i, len(options.cutoff_temp) - 1)] miners[i].cutoff_interval = options.cutoff_interval[min(i, len(options.cutoff_interval) - 1)] return miners
def _tokenize_text_for_nodes(nodes, email_text, email_idents, stops): documents = [] for node in nodes: relevant_emails = email_idents[node.parent][node.payload] relevant_emails += email_idents[node.payload][node.parent] fulltext = '\n'.join(email_text.get(ident, '') for ident in relevant_emails) documents.append(util.tokenize(fulltext, stops)) return documents
def encode_collection_with_stemming(text_by_name_p, word_dict_path, w2v_model_path, encoded_out_folder, wi=None, word_embeddings_matrix=None): text_by_name = {} print('reading files in folder') for filename in tqdm(os.listdir(text_by_name_p)): fp = os.path.join(text_by_name_p, filename) if os.path.isfile(fp): text_by_name[filename.split(r'.')[0]] = ' '.join( open(fp, 'r').readlines()) # initialize embeddings matrix if word_embeddings_matrix is None: # read and adapt word index if wi is None: wi = {} wids_to_merge = {} for line in tqdm(open(word_dict_path)): data = line.split() word_stemmed = util.stem(data[0].strip()) wid = int(data[1].strip()) if word_stemmed not in wi.keys(): wi[word_stemmed] = len(wi) wids_to_merge[word_stemmed] = [wid] else: wids_to_merge[word_stemmed].append(wid) we_size = 50 word_embeddings_matrix = np.float32( np.random.uniform(-0.02, 0.02, [len(wi) + 1, we_size])) padding_value = np.zeros(we_size) word_embeddings_matrix[word_embeddings_matrix.shape[0] - 1] = padding_value w2v_model = load_w2v_we(w2v_model_path) for k, v in wi.items(): we = np.zeros(we_size) summed_something = False for wid in wids_to_merge[k]: if wid in w2v_model.keys(): we = np.sum((we, w2v_model[wid]), axis=0) summed_something = True if summed_something: we = we / np.linalg.norm(we) # normalize new word embedding word_embeddings_matrix[v] = we encoded_docs_by_name = {} sw = load_indri_stopwords() print('encoding data') for dn, dc in tqdm(text_by_name.items()): td = util.tokenize(dc, stemming=True, stoplist=sw) encoded_doc = [wi[w] for w in td if w in wi.keys()] util.save_model(encoded_doc, os.path.join(encoded_out_folder, dn)) encoded_docs_by_name[dn] = encoded_doc return encoded_docs_by_name, wi, word_embeddings_matrix
def _get_operator_name(self): if "operator" in self.df.columns: opname = self.df.operator.astype(str).mode().iloc[0] else: opname = util.tokenize(self.filename, exclude=self.exclude, take_basename=True)[0] self.operator = Operator(opname) return self
def onCreate(self, create=True): super(Operands, self).onCreate() for token in tokenize(self.token, (",",)): if isExpression(token): o = Expr(token, self.line) else: o = Operand(token, self.line) if create: o.onCreate() self.operands.append(o)
def encode_queries(queries_main_folder, wi, stemming): sw = util.load_indri_stopwords() encoded_qbn = {} for filename in tqdm(os.listdir(queries_main_folder)): fp = os.path.join(queries_main_folder, filename) if os.path.isfile(fp): tokenized_query = util.tokenize(' '.join(open(fp, 'r').readlines()), stemming=stemming, stoplist=sw) qn = filename.split(r'.')[0] encoded_qbn[qn] = [wi[w] for w in tokenized_query if w in wi.keys()] return encoded_qbn
def test():
    print("test1: displays number of features generated from all the documents\n")
    f = open('feature_definition_file', 'r')
    count = 0
    k = f.readline()
    while k:
        count += 1
        k = f.readline()
    print("number of features generated in feature_definition_file : " + str(count))
    f.close()
    print("test2: verified that all the documents are read and parsed from the mininewsgroup directory\n")
    f = open('training_data_file', 'r')
    count = 0
    k = f.readline()
    while k:
        count += 1
        k = f.readline()
    print("number of documents parsed from mininewsgroup : " + str(count))
    print("test3 : Given a filename and filepath parse the document\n")
    fil = open('class_definition_file', "r")
    classes = {}
    r = fil.readline()
    while r:
        p = str(r.strip()).split(" ")
        if p[0] in classes:
            classes[p[0]].append(p[1])
        else:
            classes[p[0]] = [p[1]]
        r = fil.readline()
    fil.close()
    directorypath = input("Enter the filepath (eg:localpath/mini_newsgroups/alt.atheism/51121):\n")
    ngobj = news.News(directorypath, classes)
    print("DOCID : " + ngobj.docID)
    print("Newsgroup : " + ngobj.newsgroup)
    print("Class : " + ngobj.class_label)
    print("Subject : " + ngobj.subject)
    print("Body : " + ngobj.body)
    print("test4\n")
    print("Tokenizing the subject and body of the above given file, removing stop words and stemming: \n")
    print(util.tokenize(ngobj.subject + " " + ngobj.body))
    print("test5 : printing inverted index of the given file\n")
    indexobjtest = InvertedIndex()
    indexobjtest.indexDoc(ngobj)
    for key in indexobjtest.items:
        print(key + " " + str(ngobj.docID) + " " + str(indexobjtest.items[key].posting[ngobj.docID].positions))
def get_meta_text(meta_dict): """ Returns a list of 2-element dictionaries of the form [word, freq] from the metadata in the HTML. """ if meta_dict is None: return None descriptions = list((meta_dict['descriptions'])) titles = list((meta_dict['titles'])) keywords = list(meta_dict['keywords']) text_data = [] if len(descriptions) > 0: for d in descriptions: text_data.extend(util.tokenize(d)) if len(titles) > 0: for t in titles: text_data.extend(util.tokenize(t)) if len(keywords) > 0: for k in keywords: text_data.extend(util.tokenize(k)) return util.word_freq(text_data)
def build_qid_map(outlines): qid_map = {} heading_map = {} with logger.duration('reading outlines'): for outline_qid, outline_text in tqdm(util.read_outlines(outlines)): qid_map['/'.join(outline_qid)] = outline_qid for heading, text in zip(outline_qid, outline_text): heading_map[heading] = util.tokenize(text) logger.debug('found {} qids, {} headings'.format( len(qid_map), len(heading_map))) return qid_map, heading_map
def tokenize_paragraph(par):
    tok_par = dict()
    tok_par['context'] = tokenize(par['context'])
    tok_par['qas'] = list()
    for qas in par['qas']:
        try:
            tok_par['qas'].append(tokenize_qa(qas, tok_par["context"]))
        except NameError as ne:
            # print(ne)
            pass
    return tok_par
def executeQuery(self):
    query = open(self.nomeArqQuery).read().lower()
    query = util.tokenize(query)
    indObj = index.Index(self.nomeArqBase, self.nomeArqIndice)
    ind = indObj.loadIndex()
    indArqs = self.base.keys()
    # replace each token with the list of document indices from its posting list
    for i, v in enumerate(query):
        if v not in self.OPERATORS:
            query[i] = [int(oc.doc) for oc in ind[v]]
    # NOT
    while True:
        flag = False
        for i, v in enumerate(query):
            if v == "!":
                flag = True
                del query[i]  # remove operator
                query[i] = conditions.Condition.notCondition(query[i], indArqs)
        if not flag:
            break
    # AND
    while True:
        flag = False
        for i, v in enumerate(query):
            if v == "&":
                flag = True
                del query[i]  # remove operator
                query[i - 1] = conditions.Condition.andCondition(query[i - 1], query[i])  # execute intersection
                del query[i]  # remove one of the lists
        if not flag:
            break
    # OR
    while True:
        flag = False
        for i, v in enumerate(query):
            if (v == "|"):
                flag = True
                del query[i]  # remove operator
                query[i - 1] = conditions.Condition.orCondition(query[i - 1], query[i])  # execute union
                del query[i]  # remove one of the lists
        if not flag:
            break
    query[0].sort()
    return [self.base[q] for q in query[0]]
def get_keywords(text): data = {} for i, sentence in enumerate(get_sentences(text), start=1): tagged_tokens = pos_tag(tokenize(sentence)) for term in get_terms(chunk(tagged_tokens)): _keywords(term, i, data) keywords = [{ 'keyword': v['term_forms'][0], 'count': v['count'], 'locations': v['locations'] } for k, v in data.items()] return sorted(keywords, key=itemgetter('count'), reverse=True)
def tokenize_input(input_sent, vocab): """ Return a numpy array where each row is the word-indexes for each sentence """ input_tok = [] # map text to integers for sent in input_sent: text_int = [-1 if t not in vocab else vocab[t] for t in tokenize(sent)] input_tok.append(np.array(text_int)) return np.array(input_tok)
def prep_docs(path, out_path): files = os.popen('ls %s*.sent' %path).read().splitlines() ## on the first pass, create a vocab mapping vocab = set() for file in files: if '-B' in file: continue sents = open(file).read().splitlines() doc = prob_util.Counter() for sent in sents[:20]: s = util.porter_stem_sent(util.tokenize(fix_text(sent))) concepts = set(util.get_ngrams(s, 1, bounds=False, as_string=True)) vocab.update(concepts) fh = open(out_path+'vocab', 'w') vocab = zip(vocab, range(len(vocab))) for concept, count in vocab: fh.write('%s %d\n' %(concept, count)) fh.close() vocab = dict(vocab) ## on the second pass, output one doc per line for file in files: if '-B' in file: continue sents = open(file).read().splitlines() doc = prob_util.Counter() for sent in sents[:20]: s = util.porter_stem_sent(util.tokenize(fix_text(sent))) concepts = set(util.get_ngrams(s, 1, bounds=False, as_string=True)) for concept in concepts: doc[concept] += 1 ## doc output output = '%d %s' %(len(doc), ' '.join(['%s:%d' %(vocab[t],c) for t,c in doc.items()])) print output
def trainAll(): trans_prob = collections.defaultdict(_constant_factory(1e-6)) i = 0 while i < len(english): if (i % size == 0): print("Parallel corpus Added") global parallel_corpus trans_prob = train1(english, french, trans_prob) with open('OutputFiles/transProb' + str(i) + '.pickle', 'wb') as fp: pickle.dump(trans_prob, fp) print("Done Training for lines ", i, "\n\n") parallel_corpus = [] print(trans_prob[('on', 'sur')]) parallel_corpus.append((tokenize(english[i]), tokenize(french[i]))) i += 1 trans_prob = train1(english, french, trans_prob) return trans_prob
def __init__(self, id, order, orig, doc, tok=None, parse=None, par=None, unresolved=False): self.id = id self.order = order self.orig = orig self.tok = tok self.tok2 = util.porter_stem_sent(util.tokenize(fix_text(self.orig))) self.doc = doc self.parse = parse self.new_par = (par == '1') self.length = len(self.orig.split()) self.depends = set() self.groups = [] self.skip = False self.skip_concepts = False self.unresolved = unresolved self.atleast = ""
def createIndex(self):
    indice = {}
    i = 1
    arquivosTexto = self.loadBase()
    for arquivo in arquivosTexto.values():
        arqTexto = open(arquivo)
        # split the text into words
        tokens = util.tokenize(arqTexto.read().replace(".", " ").replace(
            ",", " ").replace("!", " ").replace("?", " ").replace("\n", " ").lower())
        # for each token
        for token in tokens:
            # if the token is already in the dictionary
            if token in indice:
                # if this file's index is already linked to the token
                if i in indice[token]:
                    indice[token][i] += 1  # increment the number of occurrences
                else:
                    indice[token][i] = 1  # initialize with one occurrence
            else:
                indice[token] = {i: 1}  # initialize the dictionary entry
        i += 1
        arqTexto.close()
    # sort the index
    indice = OrderedDict(sorted(indice.items()))
    # open the index file
    arqIndice = open(self.nomeArqIndiceResult, 'w')
    # for each token in the index
    for tok, dic in indice.items():
        linha = tok + ":"
        for arq, ocor in dic.items():  # for each file that contains the token
            linha = linha + " " + str(arq) + "," + str(ocor)
        arqIndice.write(linha + "\n")  # write the line
    # close the file
    arqIndice.close()
def correct(sentence):
    sentence = preprocess(sentence)
    tokens = tokenize(sentence)
    print('segmented sentence is:', ''.join([str(token) for token in tokens]))
    seg_range = [[token[1], token[2]] for token in tokens]
    _, _, maybe_error_range = score_sentence(sentence)
    maybe_error_ranges = []
    if maybe_error_range:
        print('maybe error range:', maybe_error_range)
        maybe_error_ranges = merge_ranges(overlap_ranges(maybe_error_range, seg_range))
    for err_range in maybe_error_ranges:
        start_index, end_index = err_range
        print('maybe error words:', sentence[start_index:end_index])
        corrected_words = correct_chars(sentence, start_index, end_index)
        print('corrected words:', corrected_words)
        sentence = sentence[:start_index] + corrected_words + sentence[end_index:]
    return sentence, maybe_error_ranges
def make_concepts_gold(id, path, sents, gold_sents): ## get gold concepts all_concepts = collections.defaultdict(set) for sent in gold_sents: concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True)) for concept in concepts: all_concepts[concept].add(sent.doc) ## create final concept set final_concepts = {} for concept, docs in all_concepts.items(): count = len(docs) if util.is_just_stopwords(concept.split("_")): continue final_concepts[concept] = count final_concept_set = set(final_concepts.keys()) ## get sentence concepts seen_sents = set() for sent_index in range(len(sents)): sent = sents[sent_index] sent.concepts = set([]) ## skip some sents skip = False # if sent.order >= 3: skip = True if not sent.new_par: skip = True if sent.length < 20: skip = True if sent.orig in seen_sents: skip = True if sent.length <= 5: skip = True if skip: continue seen_sents.add(sent.orig) s = util.porter_stem_sent(util.tokenize(fix_text(sent.orig))) concepts = set(util.get_ngrams(s, 2, bounds=False, as_string=True)) sent.concepts = concepts.intersection(final_concept_set) return create_ilp_output(sents, final_concepts, path + id)
def do_run(args): """ Run the neural net to predict on new data. """ # Load the model and weights model = load_model(args.model, args.weights) wvecs = WordVectorModel.from_file(args.wvecs, False, '*UNKNOWN*') data = ((tweet.id, tokenize(to_ascii(tweet.text))) for tweet in RowObjectFactory.from_stream(csv.reader(args.input, delimiter="\t"))) writer = csv.writer(args.output, delimiter='\t') writer.writerow(['id',] + LABELS) for ix in tqdm(grouper(args.batch_size, data)): ids_batch, X_batch = zip(*ix) X_batch = wvecs.embed_sentences(X_batch) labels = model.predict_on_batch(X_batch) for id, label in zip(ids_batch, labels): writer.writerow([id,] + [float(l) for l in label])
def parse_line(line, init=u'english', tokenize=False): import re stack = [init] bracketRec = re.compile(ur'(<([^> ]+)\s*[^>]*>)') to_return = [] proper_line = [x.strip() for x in bracketRec.sub(ur' <\2> ', line).split()] if tokenize: import util # workaround for angle bracket chars to_join = [] for w in proper_line: if bracketRec.match(w): to_join.append(w.replace(u'<', u'__begin__').replace(u'>', u'__end__')) else: to_join.append(w) to_tok = [u' '.join(to_join)] tokenized = util.tokenize(to_tok)[0].split() proper_line = [] for t in tokenized: proper_line.append(t.replace(u'__begin__', u'<').replace(u'__end__', u'>')) for w in proper_line: logging.info(u'current word: {}'.format(w)) m = bracketRec.match(w) if m is not None: lang_string = m.groups()[1] if lang_string == stack[-1]: logging.warn(u'language {} already appeared on stack for word {} in line \n {}'.format(lang_string, w, line)) if lang_string.startswith('/'): stack.pop() if len(stack) == 0: logging.warn(u'stack empty after popping {}'.format(lang_string)) stack.append(init) else: stack.append(lang_string) logging.info(u'new language: {}'.format(lang_string)) continue else: to_return.append((w, stack[-1])) return to_return
def __init__(self, bytes, id, order, orig, doc, tok=None, par=None, unresolved=False, lang='fr'):
    self.id = id
    self.order = order
    self.orig = orig
    self.tok = tok
    if lang == 'en':
        self.tok2 = util.porter_stem_sent(util.tokenize(fix_text(self.orig)))
    elif lang == 'fr':
        self.tok2 = " ".join([stmr.stem(w) for w in nltk.tokenize.word_tokenize(self.orig.decode('utf8'))])
    else:
        print 'Unsupported language...'
        sys.exit(0)
    # print "TOK2", len(self.tok2)
    self.doc = doc
    self.new_par = (par == '1')
    if bytes > -1:
        self.length = len(self.orig)  # for bytes
    else:
        self.length = len(self.orig.split())
    self.unresolved = unresolved
def train(): """ Accept text messages from the user and whether or not it's a positive or negative message to train a classifier. """ db = util.Database() c = db.getCursor() while True: s = raw_input("Enter sentence or \".\" to stop training: ") if s == ".": break tokens = repr(util.tokenize(s)) l = raw_input( "Is this a positive (1), neutral(0) or negative(-1) sentence: " ) label = int(l) c.execute("INSERT INTO training_data VALUES (?,?)", (tokens, label)) db.close()
stops = stops or [] counts = defaultdict(int) for token in tokens: if token not in stops: counts[token] += 1 return sorted((count, token) for token, count in counts.items()) def top_tokens(tokens): return [item for item in reversed(tokens[-10:])] def unique_tokens(tokens): return [token for count, token in tokens if count == 1] token_map = {} for tweet in tweets(): for token, tag in tag_tweets(tokenize(tweet['text'])): token_map.setdefault(tag, []).append(token) for tag, tokens in token_map.items(): print "% 10s % 8s % 6s" % (tag, len(tokens), len(set(tokens))) print "" print "EMOTICONS" print "=========" counted = count_tokens(token_map['emoticon']) print top_tokens(counted) print len(unique_tokens(counted)) print "" print "URLS" print "===="
def testcorrect_token_contents(self): self.assertEquals( util.tokenize(" \t\n\rfred jim \tbob \t"), ["fred", "jim", "bob"])
def testcorrect_num_tokens(self): self.assertEquals( len(util.tokenize("a b c d \t\n\rfred")), 5)
def testepsilon(self): self.assertEquals( util.tokenize(""), [])
def make_concepts_exp(id, path, sents, query): """ """ query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split()) ## get sentence values sent_vals = prob_util.Counter() for sent in sents: query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words) sent_vals[sent] = max(0, len(query_overlap)) #if sent.relevance < 0.3: sent_vals[sent] = 0.0 #else: sent_vals[sent] = 100000**sent.relevance concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True)) sent.concepts = set() for concept in concepts: if util.is_just_stopwords(concept.split('_')): continue sent.concepts.add(concept) sent_vals = prob_util.normalize(sent_vals) ## get concept values concept_vals = prob_util.Counter() for sent in sents: for concept in sent.concepts: concept_vals[concept] += sent_vals[sent] concept_vals = prob_util.normalize(concept_vals) iter = 0 while True: iter += 1 se = prob_util.entropy(sent_vals) ce = prob_util.entropy(concept_vals) print >>sys.stderr, 'iter [%d] sent entropy [%1.4f] concept entropy [%1.4f]' %(iter, se, ce) if iter >= 1: break ## get sent vals again sent_vals = prob_util.Counter() for sent in sents: for concept in sent.concepts: sent_vals[sent] += concept_vals[concept] sent_vals = prob_util.normalize(sent_vals) ## get concept values concept_vals = prob_util.Counter() for sent in sents: for concept in sent.concepts: concept_vals[concept] += sent_vals[sent] concept_vals = prob_util.normalize(concept_vals) sorted_sents = sent_vals.sortedKeys() #for sent in sorted_sents: # print sent_vals[sent], sent.order, sent.new_par, sent sorted_concepts = concept_vals.sortedKeys() #for concept in sorted_concepts: # print concept_vals[concept], concept ## create final concept set final_concepts = {} for concept in sorted_concepts: val = concept_vals[concept] #if val < 0.00001: continue final_concepts[concept] = val final_concept_set = set(final_concepts.keys()) ## get final sentence list and their concepts seen_sents = set() for sent in sents: skip = False if sent.length <= 5: skip = True if sent in seen_sents: skip = True if sent.order > 0: skip = True else: seen_sents.add(sent) if skip: sent.concepts = set() else: sent.concepts = sent.concepts.intersection(final_concept_set) return create_ilp_output(sents, final_concepts, path+id)
def make_concepts_compress2(id, path, sents, query, compressed_sents): """ """ query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split()) seen_sents = set() all_concepts = collections.defaultdict(set) ## different processing for set A and set B if '-B' in id: first_weight = 2 count_thresh = 4 query_thresh = 0 else: first_weight = 1 count_thresh = 3 query_thresh = 1 for sent in sents: ## store this sentence's concepts sent.concepts = set() concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True)) ## get query overlap query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words) ## aggregate all concepts if len(query_overlap) >= query_thresh: for concept in concepts: if sent.order == 0: all_concepts[concept].add('first' + sent.doc) else: all_concepts[concept].add(sent.doc) ## ignore some sents skip = False #if not sent.new_par: skip = True #if sent.length <= 20: skip = True if sent.tok in seen_sents: skip = True #if sent.ignore: skip = True if skip: continue seen_sents.add(sent.tok) sent.concepts = concepts ## create final concept set final_concepts = {} for concept, docs in all_concepts.items(): count = len(docs) firsts = len([1 for d in docs if 'first' in d]) count = count + (first_weight * firsts) if count < count_thresh: continue if util.is_just_stopwords(concept.split('_')): continue final_concepts[concept] = count final_concept_set = set(final_concepts.keys()) for sent in sents: sent.concepts = sent.concepts.intersection(final_concept_set) for sent in compressed_sents: sent.concepts = set([]) if sent.unresolved: continue if sent.length < 10: continue if re.match('^["(].*[")]$', sent.orig): skip = True concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True)) sent.concepts = concepts.intersection(final_concept_set) return create_ilp_output(compressed_sents, final_concepts, path+id)
def testonly_whitespace(self): self.assertEquals( util.tokenize("\t\n\r "), [])
group.add_option('--vv', dest='vectors', default=[], help='use vectors, default false') group.add_option('-v', '--vectors', dest='old_vectors',action='store_true', help='use vectors') parser.add_option_group(group) (options, options.servers) = parser.parse_args() log.verbose = options.verbose log.quiet = options.quiet options.rate = if_else(options.verbose, max(options.rate, 60), max(options.rate, 0.1)) options.version = VERSION options.max_update_time = 60 options.device = tokenize(options.device, 'device', []) options.cutoff_temp = tokenize(options.cutoff_temp, 'cutoff_temp', [95], float) options.cutoff_interval = tokenize(options.cutoff_interval, 'cutoff_interval', [0.01], float) switch = None try: switch = Switch(options) if not options.no_ocl: import OpenCLMiner for miner in OpenCLMiner.initialize(options): switch.add_miner(miner) if not options.no_bfl: import BFLMiner
def filterComment(source, startLine, codeFile, commentFile, maxBucket): """ Find the comment at line i in the list source. When found check for a multiline comment and get the corresponding code """ comment = "" indentation = -1 currIndent = -1 code = [] globalI = len(source) + 10 # loop through all the lines in the source, get the comment # and the corresponding code with open(commentFile, "a") as commentF: with open(codeFile, "a") as codeF: for i in xrange(startLine, len(source)): globalI = i line = source[i] # comments need to be directly above code if line.strip() == "" and comment == "": return (i,False) # Continue if we have an divider row if line.replace("#", "").strip() == "" and line.strip() != "": continue # check if it is an comment, and if so add it to the comment if line.strip()[:2] in commentList: comment += line.strip().replace("#", "") + " " continue # lines with docstrings are skipped if '"""' in line or "'''" in line: return (i,False) # if we get here, it means we are not in the comment anymore # First get the indentation level of the current line of code currIndent = len(line) - len(line.lstrip()) # If it is the first line of code, set our indentation level if indentation == -1: indentation = currIndent # if we hit an empty line and have no code yet, return with an error if line.strip() == "" and code == []: return (i,False) # if we hit an empty line or go to an parent piece in the code # return the gathered code if line.strip() == "" or indentation > currIndent or (any(c in line for c in commentList)): code = util.cleanCode(code) # no need to save code-comment pairs larger than maxBucket size if util.tokenize("".join(code)) < maxBucket[0] and util.tokenize(comment) < maxBucket[1] \ and not (any(exc in comment.lower() for exc in commentExceptions)): # write to file for j in xrange(len(code)): codeF.write(code[j] + "\n") codeF.write("!@#$%!@#$%!@#$%!@#$%!@#$%") commentF.write(util.cleanComment(comment) + "\n!@#$%!@#$%!@#$%!@#$%!@#$%") return (i,True) else: return (i,False) # add the line to our code if all is well (without any inline comments if any) if line.strip() != "": code.append(line) code = util.cleanCode(code) # if we are here check if we have a comment / code not empty and smaller than maxBucket size if comment.strip() != "" and code != [] and \ util.tokenize("".join(code)) < maxBucket[0] and util.tokenize(comment) < maxBucket[1] \ and not (any(exc in comment.lower() for exc in commentExceptions)): # write to file for j in xrange(len(code)): codeF.write(code[j] + "\n") codeF.write("!@#$%!@#$%!@#$%!@#$%!@#$%") commentF.write(util.cleanComment(comment) + "\n!@#$%!@#$%!@#$%!@#$%!@#$%") return (globalI+1,True) else: return (globalI+1,False)
def __iter__(self): for text in util.iter_corpus(self.dirname): # tokenize each work yield util.tokenize(text)
def tokenize(src): for token_info in util.tokenize(src): token_class = TOKENS[token_info.type] yield token_class(*token_info[1:])
def load_data(path, file_ext=['txt'], valid_split=None, vocab_file_name=None, max_vocab_size=None, max_len_w=None, output_path=None, subset_pct=100): """ Given a path where data are saved, look for the ones with the right extensions If a split factor is given, it will split all the files into training and valid set. Then build vocabulary from the training and validation sets. Arguments: path: which directory to look for all the documents file_ext: what extension of the files to look for valid_split: to split the data into train/valid set. If None, no split vocab_file_name: optional file name. If None, the script will decide a name given path and split max_vocab_size: maximum number of words to use in vocabulary (by most frequent) max_len_w: maximum length of sentences in words output_path: path used to save preprocessed data and resuts subset_pct: subset of dataset to load into H5 file (percentage) Returns: The function saves 2 files: h5 file with preprocessed data vocabulary file with: vocab, reverse_vocab, word_count """ file_names = get_file_list(path, file_ext) file_str = get_file_str(path, len(file_names), labelled=False, valid_split=valid_split, subset_pct=subset_pct) # create output dir if needed if not os.path.isdir(output_path): os.makedirs(output_path) # file name to store the vocabulary if vocab_file_name is None: vocab_file_name = file_str + '.vocab' vocab_file_name = os.path.join(output_path, vocab_file_name) # If max sizes arent set, assume no limit if not max_len_w: max_len_w = sys.maxsize if not max_vocab_size: max_vocab_size = sys.maxsize # file name to store the pre-processed train/valid dataset h5_file_name = os.path.join(output_path, file_str + '.h5') if os.path.exists(h5_file_name) and os.path.exists(vocab_file_name): neon_logger.display("dataset files {} and vocabulary file {} already exist. " "will use cached data. ".format(h5_file_name, vocab_file_name)) return h5_file_name, vocab_file_name # split into training/valid set if valid_split is not None: if 'json' in file_ext: # Split based on number of files train_split = int(np.ceil(len(file_names) * (1 - valid_split))) train_files = file_names[:train_split] valid_files = file_names[train_split:] train_sent = load_json_sent(train_files, subset_pct) valid_sent = load_json_sent(valid_files, subset_pct) all_sent = train_sent + valid_sent elif 'txt' in file_ext: # Split based on number of lines (since only 2 files) all_sent = load_txt_sent(file_names, subset_pct) train_split = int(np.ceil(len(all_sent) * (1 - valid_split))) train_sent = all_sent[:train_split] valid_sent = all_sent[train_split:] else: neon_logger.display("Unsure how to load file_ext {}, please use 'json' or 'txt'." .format(file_ext)) else: train_files = file_names if 'json' in file_ext: train_sent = load_json_sent(train_files, subset_pct) elif 'txt' in file_ext: train_sent = load_txt_sent(train_files, subset_pct) else: neon_logger.display("Unsure how to load file_ext {}, please use 'json' or 'txt'." .format(file_ext)) all_sent = train_sent if os.path.exists(vocab_file_name): neon_logger.display("open existing vocab file: {}".format(vocab_file_name)) vocab, rev_vocab, word_count = load_obj(vocab_file_name) else: neon_logger.display("Building vocab file") # build vocab word_count = defaultdict(int) for sent in all_sent: sent_words = tokenize(sent) if len(sent_words) > max_len_w or len(sent_words) == 0: continue for word in sent_words: word_count[word] += 1 # sort the word_count , re-assign ids by its frequency. 
Useful for downstream tasks # only done for train vocab vocab_sorted = sorted(word_count.items(), key=lambda kv: kv[1], reverse=True) vocab = OrderedDict() # get word count as array in same ordering as vocab (but with maximum length) word_count_ = np.zeros((len(word_count), ), dtype=np.int64) for i, t in enumerate(list(zip(*vocab_sorted))[0][:max_vocab_size]): word_count_[i] = word_count[t] vocab[t] = i word_count = word_count_ # generate the reverse vocab rev_vocab = dict((wrd_id, wrd) for wrd, wrd_id in vocab.items()) neon_logger.display("vocabulary from {} is saved into {}".format(path, vocab_file_name)) save_obj((vocab, rev_vocab, word_count), vocab_file_name) vocab_size = len(vocab) neon_logger.display("\nVocab size from the dataset is: {}".format(vocab_size)) neon_logger.display("\nProcessing and saving training data into {}".format(h5_file_name)) # now process and save the train/valid data h5f = h5py.File(h5_file_name, 'w', libver='latest') shape, maxshape = (len(train_sent),), (None) dt = np.dtype([('text', h5py.special_dtype(vlen=str)), ('num_words', np.uint16)]) report_text_train = h5f.create_dataset('report_train', shape=shape, maxshape=maxshape, dtype=dt, compression='gzip') report_train = h5f.create_dataset('train', shape=shape, maxshape=maxshape, dtype=h5py.special_dtype(vlen=np.int32), compression='gzip') # map text to integers wdata = np.zeros((1, ), dtype=dt) ntrain = 0 for sent in train_sent: text_int = [-1 if t not in vocab else vocab[t] for t in tokenize(sent)] # enforce maximum sentence length if len(text_int) > max_len_w or len(text_int) == 0: continue report_train[ntrain] = text_int wdata['text'] = clean_string(sent) wdata['num_words'] = len(text_int) report_text_train[ntrain] = wdata ntrain += 1 report_train.attrs['nsample'] = ntrain report_train.attrs['vocab_size'] = vocab_size report_text_train.attrs['nsample'] = ntrain report_text_train.attrs['vocab_size'] = vocab_size if valid_split: neon_logger.display("\nProcessing and saving validation data into {}".format(h5_file_name)) shape = (len(valid_sent),) report_text_valid = h5f.create_dataset('report_valid', shape=shape, maxshape=maxshape, dtype=dt, compression='gzip') report_valid = h5f.create_dataset('valid', shape=shape, maxshape=maxshape, dtype=h5py.special_dtype(vlen=np.int32), compression='gzip') nvalid = 0 for sent in valid_sent: text_int = [-1 if t not in vocab else vocab[t] for t in tokenize(sent)] # enforce maximum sentence length if len(text_int) > max_len_w or len(text_int) == 0: continue report_valid[nvalid] = text_int wdata['text'] = clean_string(sent) wdata['num_words'] = len(text_int) report_text_valid[nvalid] = wdata nvalid += 1 report_valid.attrs['nsample'] = nvalid report_valid.attrs['vocab_size'] = vocab_size report_text_valid.attrs['nsample'] = nvalid report_text_valid.attrs['vocab_size'] = vocab_size h5f.close() return h5_file_name, vocab_file_name