def add_sentence(self, sentence, tokenize, sentence_no):
    words = ['<s>'] + helper.tokenize(sentence, tokenize) + ['</s>']
    self.sentence1_str = sentence
    if sentence_no == 1:
        self.sentence1 = words
    else:
        self.sentence2 = words
def worker_task(articles, args, worker_id, Dict):
    random.seed(args.seed)
    sys.stdout.write("{}: Begin processing article files...\n".format(worker_id))
    article_file_count = 0
    for article_file in articles:
        sys.stdout.write("{}: Begin processing file {}\n".format(worker_id, article_file))
        # process the file and tokenize it
        tokenized_sentences = {}  # dictionary of {index: tokenized_sentence}
        tokenized_labels = {}
        it = 0
        with open(join(args.inputpath, article_file), "r") as F:
            for line in F:
                if line.startswith("<doc") or line.startswith("</doc>"):
                    continue
                sentences = [filter_with_alphabet(s, args.alphabet)
                             for s in re.split(args.separator, sanitize_line(line))]
                for i in range(len(sentences)):
                    tmp = tokenize(Dict, sentences[i], gram_length, token_weight)
                    tokenized_sentences[it] = tmp
                    tokenized_labels[it] = get_gram_label(Dict_gram_to_label, tmp)
                    it += 1
        with open(join(args.outputpath + "/readable_articles", article_file + ".readable"), "w") as F:
            F.write(json.dumps(tokenized_sentences))
        with open(join(args.outputpath + "/articles", article_file), "w") as F:
            F.write(json.dumps(tokenized_labels))
        article_file_count += 1
        sys.stdout.write("{}: Finished processing {}th file {}\n".format(worker_id, article_file_count, article_file))
def form_vocabulary(self, in_file, tokenize):
    """Creates the vocabulary."""
    assert os.path.exists(in_file)
    with open(in_file, 'r') as f:
        for line in f:
            session = json.loads(line)
            assert len(session['query']) == len(session['clicks'])
            for qidx in range(len(session['query'])):
                query_terms = helper.tokenize(session['query'][qidx][0], tokenize)
                query_letter_n_grams = self.get_letter_n_grams(query_terms, self.order_n_gram)
                self.add_letter_n_grams(query_letter_n_grams)
                for i in range(len(session['clicks'][qidx])):
                    doc_title = session['clicks'][qidx][i][1]
                    title_terms = helper.tokenize(doc_title, tokenize)
                    doc_letter_n_grams = self.get_letter_n_grams(title_terms, self.order_n_gram)
                    self.add_letter_n_grams(doc_letter_n_grams)
def add_sentence(self, sentence, tokenize, sentence_no, dictionary, is_test_instance):
    words = ['<s>'] + helper.tokenize(sentence, tokenize) + ['</s>']
    if not is_test_instance:
        for word in words:
            dictionary.add_word(word)
    if sentence_no == 1:
        self.sentence1 = words
    else:
        self.sentence2 = words
def add_pos_tags(self, sentence, tokenize):
    """We assume that the raw sentence is passed in; this is for the self-attentive network.

    :param sentence: raw sentence, not tokenized
    :return: void
    """
    tokenized_sent = helper.tokenize(sentence, tokenize)
    pos_tags = pos_tag(tokenized_sent)
    self.pos_tags = [tag[1] for tag in pos_tags]
    for i in range(len(self.pos_tags)):
        if self.pos_tags[i] not in pos_to_idx:
            self.pos_tags[i] = 'UNK'
def train(self, data, category):
    self.prior_counts[category] += 1
    words = tokenize(data)
    count_dict = count_words(words)
    for word, count in count_dict.items():
        if word not in self.vocab:
            self.vocab[word] = 0.0  # initialize with 0.0 so later divisions are floating-point
        if word not in self.word_counts[category]:
            self.word_counts[category][word] = 0.0
        self.vocab[word] += count
        self.word_counts[category][word] += count
    self.update_prior_prob()
def add_text(self, text, tokenize, max_len):
    content_terms = helper.tokenize(text, tokenize)
    content_terms = content_terms if len(content_terms) <= max_len else content_terms[:max_len]
    content_terms = ['<s>'] + content_terms + ['<s>']
    content_terms = ['#' + item + '#' for item in content_terms]
    for i in range(len(content_terms)):
        # create letter-trigrams
        word = content_terms[i]
        letter_trigrams_for_words = []
        for j in range(0, len(word) - 2):
            letter_trigrams_for_words.append(word[j:j + 3])
        self.letter_trigrams.append(letter_trigrams_for_words)
def imagenet_flickr8k_intersection():
    path = "../data/Flickr8k_text/Flickr8k.token.txt"
    wordlist, worddictionary = tokenize(path)
    worddf = dicttodf(worddictionary)
    path2 = "/Users/sebastiaanscholten/Documents/speech2image-master/vgsexperiments/experiments/data/imagenet_class_index.json"
    with open(path2, "r") as json_file:
        data = json.load(json_file)
    wordlist1 = list(worddf["words"])
    wordlist2 = []
    for words in data.values():
        wordlist2.append(words[1])
    return intersection(wordlist1, wordlist2)
def predict(self, data):
    words = tokenize(data)
    count_dict = count_words(words)
    p_abstract_given = self.init_p_abstract_given_category()
    for w, cnt in count_dict.items():
        # if w in stopwords:
        #     continue
        p_w_given = self.calc_p_w_given_category(w)
        for category in self.categories:
            p_abstract_given[category] *= p_w_given[category] ** cnt
    posterior_prob = self.calc_posterior_prob(p_abstract_given)
    predicted_category = max(posterior_prob.items(), key=operator.itemgetter(1))[0]
    return predicted_category
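# Note on the snippet above: the loop applies the Naive Bayes rule, where
# P(category | text) is proportional to P(category) * prod over words of
# P(word | category) ** count(word). The prior P(category) is presumably
# folded in by init_p_abstract_given_category() or calc_posterior_prob(),
# which are not shown in this snippet.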
def write(input_string, output_path):
    with open(output_path, "w") as f:
        writer = csv.writer(f, delimiter="\t")
        tokens = helper.tokenize(input_string)
        normalizations = []
        tokens_and_normalizations = []
        for token in tokens:  # normalize each token and store it in a list
            normalizations.append(helper.normalize(token))
        for i in range(len(tokens)):  # create (token, normalization) pairs
            temporary_pair = []       # and store them in a nested list
            temporary_pair.append(tokens[i])
            temporary_pair.append(normalizations[i])
            tokens_and_normalizations.append(temporary_pair)
        writer.writerows(tokens_and_normalizations)
def add_text(self, text, tokenize, max_len):
    content_terms = helper.tokenize(text, tokenize)
    content_terms = content_terms if len(content_terms) <= max_len else content_terms[:max_len]
    for i in range(len(content_terms)):
        # create letter n-grams
        word = content_terms[i]
        word_letter_n_grams = []
        for j in range(1, self.order_n_gram + 1):
            if j > len(word):
                break
            else:
                # create letter n-grams where n = j
                word_letter_n_grams.extend(self.find_letter_ngrams(word, j))
        if word_letter_n_grams:
            self.letter_n_grams.append(word_letter_n_grams)
def write(input_string, output_path):
    with open(output_path, "w", newline="") as f:
        # To write .csv and .tsv files, you first open a file, then call
        # csv.writer() and give it the file as an argument, as seen below.
        # (It doesn't work if you haven't imported the csv module above.)
        #     writer = csv.writer(f)
        writer = csv.writer(f, delimiter="\t")
        # The csv.writer() call can take just one argument, the file it will
        # write to, in which case it looks just as above. It can, however,
        # take a second argument and look as follows:
        #     csv.writer(f, delimiter=",")
        # which tells the csv module explicitly that we want to use "," as the
        # symbol that separates the individual fields.
        # TODO: Change the csv.writer() call above so that the csv module uses
        # the tab as a delimiter. Check exercise-5-readme.md if you are unsure.

        # Then you can use writer.writerows() to write your .csv file. The
        # writerows() function takes a list of lists as its argument. For
        # example, calling writerows([[a, b], [c, d]]) results in the
        # following .csv file:
        #     a,b
        #     c,d
        # TODO: construct a list of lists of the form
        #     [[token1, normalised_form1],
        #      [token2, normalised_form2],
        #      ...]
        # using the helper.tokenize() and helper.normalize() functions, then
        # change the call below to use your list of lists.
        tokens = helper.tokenize(input_string)
        tokens_and_normalizations = [[token, helper.normalize(token)]
                                     for token in tokens]
        writer.writerows(tokens_and_normalizations)
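# A minimal usage sketch for the write() function above (hypothetical file name;
# assumes the helper.tokenize and helper.normalize functions from the exercise
# are importable): each row of the resulting file holds a token and its
# normalised form, separated by a tab.
#
#     write("This is a sentence.", "output.tsv")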
def collect():
    global chunks
    i = 0
    with open("data/posts.csv", "r") as f:
        for post in csv.DictReader(f, fieldnames=["gender", "author", "body"]):
            if i % 100 == 0:
                print(i)
            i += 1
            # quick tokenize check to ensure we don't divide by 0
            if len(helper.tokenize(post["body"])) == 0:
                continue
            x = features.get_features(post["body"])
            y = [post["gender"] == "male"]
            chunks.append(y + x)
    chunks = np.array(chunks)
    np.random.shuffle(chunks)
    np.save("data/chunks.npy", chunks)
def process_text(self, text, nummify=True, add_words_to_list=True, is_query=False):
    tokenized = helper.tokenize(text, self.text_processor_pipeline)
    if is_query:
        vec = []
        for word in tokenized:
            if word in self.word_to_num:
                vec.append(self.word_to_num[word])
        return vec
    else:
        for word in tokenized:
            self.add_word(word)
        if not nummify:
            return
        return self.nummify(tokenized)
def transform(entry, output):
    chunks = []
    try:
        i = 0
        with open(entry, "r") as f:
            for post in csv.DictReader(f, fieldnames=["gender", "author", "body"]):
                if i % 100 == 0:
                    print(i)
                i += 1
                # quick tokenize check to ensure we don't divide by 0
                if len(helper.tokenize(post["body"])) == 0:
                    continue
                x = features.get_features(post["body"])
                y = [post["gender"] == "male"]
                chunks.append(y + x)
        chunks = np.array(chunks)
        np.random.shuffle(chunks)
        np.save(output, chunks)
    except KeyboardInterrupt:
        timestamp = datetime.now().timestamp()
        np.save(f"{output}_{timestamp}", chunks)
def __init__(self, content, max_len, tokenize=False):
    content_terms = helper.tokenize(content, tokenize)
    self.text = content_terms if len(content_terms) <= max_len else content_terms[:max_len]
    self.is_clicked = False
def question(self):
    """
    Parses the user's most recent message, and decides what to do
    based on the content and the current status.
    """
    name = self.name
    mess = self.memory.read("message")
    if mess is None:
        return "reset"
    m = tokenize(mess)
    mess = " ".join(m)

    # if the user wants to exit, then
    # return True (kill the session)
    if mess.startswith("exit") or \
            "bye" in m or \
            "goodbye" in m:
        self.bot.send("Glad to be of help :)", name)
        return "exit"

    # if the user says "nevermind", then
    # clear the session
    if mess.find("nevermind") != -1:
        self.bot.send("Ok.", name)
        self.clear()
        return "reset"

    # if the user greets Dodona, then respond in kind
    if "hi" in m or \
            "hey" in m or \
            "hello" in m:
        self.bot.send("Hello, " + name + "!")
        return None

    # check the status, and return the corresponding
    # function if necessary
    s = self.memory.read("status")
    #if s == "unknown": return self.unknown(mess)
    if s == "learn":
        return self._learn()
    if s:
        if s.startswith("pos"):
            return self._part_of_speech(mess, s.split("_")[1])

    d = self.memory.read("data")
    k = self.memory.read("topic")

    # if there is no current topic, then decipher one
    # from the most recent message
    if k is None:
        self._AI(mess)
        if self.memory.read("topic"):
            return None
        else:
            return "reset"
    # if there is a current topic, search for a subtopic
    else:
        self._AI(mess, d, k)
        if self.memory.read("status") == "pos_first":
            return None
        else:
            self.memory.pop("topic")
            self.memory.pop("data")
            return "reset"
                      workers=cpu_count, iter=n_iterations)

# training the word model
word_model.build_vocab(text_clean)
word_model.train(text_clean)

# creating mapping dictionaries
word_idx, idx_word, word_vec = helper.ind(vocab, word_model)

# creating embedding matrix
embed_weight = helper.embed_wt(vocab, word_idx, word_vec, vec_dim)

# sequences
sent_seq_parsed, sent_seq_parsed_pad = helper.tokenize(text_clean, word_idx, maxlen)

# training data
word_list = word_tokenize(text_clean)
n_symbols = len(vocab) + 1

sentences_list = []
for sentence in sent_seq_parsed_pad:
    for idx in sentence:
        sentences_list.append(idx)

sentences_seq = []
next_word = []
for i in range(len(sentences_list) - maxlen):
    sentences_seq.append(sentences_list[i: i + maxlen])
    next_word.append(sentences_list[i + maxlen])
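# For illustration (hypothetical values): with maxlen = 3 and
# sentences_list = [5, 8, 2, 9, 4], the loop above yields
# sentences_seq = [[5, 8, 2], [8, 2, 9]] and next_word = [9, 4],
# i.e. each window of maxlen token ids is paired with the id that follows it
# (next-word prediction training pairs).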
def add_text(self, text, tokenize, max_length):
    content_terms = helper.tokenize(text, tokenize)
    if len(content_terms) > max_length:
        self.query_terms = ['<s>'] + content_terms[:max_length] + ['</s>']
    else:
        self.query_terms = ['<s>'] + content_terms + ['</s>']
                   sep='\t', names=['label', 'body_text'], header=None)
data.columns = ['label', 'body_text']

# START DATA PREPROCESSING
data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(' '))
data['punct%'] = data['body_text'].apply(lambda x: count_punctuation(x))
data['body_text_clean'] = data['body_text'].apply(lambda x: remove_punctuation(x))
data['body_text_tokenized'] = data['body_text_clean'].apply(lambda x: tokenize(x))

stopwords = nltk.corpus.stopwords.words('english')
data['body_text_nonstop'] = data['body_text_tokenized'].apply(
    lambda x: remove_stopwords(x, stopwords))

stemmer = nltk.PorterStemmer()
data['body_text_stemmed'] = data['body_text_nonstop'].apply(
    lambda x: stemming(x, stemmer))

wn = nltk.WordNetLemmatizer()
data['body_text_lemmatized'] = data['body_text_nonstop'].apply(
    lambda x: lemmatizing(x, wn))
def __init__(self, text, max_len, tokenize=False):
    content_terms = helper.tokenize(text, tokenize)
    self.text = content_terms if len(content_terms) <= max_len else content_terms[:max_len]
    self.rel_docs = []
epoch = args.epoch
print_nsteps = args.print_nsteps
verbose = args.verbose
ckpt_prefix = args.ckpt_prefix
batch_size = hparams['batch_size']

if not os.path.isdir(ckpt_prefix):
    os.mkdir(ckpt_prefix)
ckpt_path = os.path.join(ckpt_prefix, 'model.ckpt')

tgt_sentences, tgt_metadata = data_pipeline(args.tgt_dataset, padding=True)
src_sentences, src_metadata = data_pipeline(args.src_dataset, padding=True)

src_inputs = np.array([tokenize(sentence, src_metadata, source=True, reverse=True)
                       for sentence in src_sentences])
tgt_outputs = np.array([tokenize(sentence, tgt_metadata, source=False, reverse=False)
                        for sentence in tgt_sentences])

save_metadata(tgt_metadata, "tgt_metadata.dill")
save_metadata(src_metadata, "src_metadata.dill")

hparams['tgt_vocab_size'] = tgt_metadata.vocab_size
hparams['src_vocab_size'] = src_metadata.vocab_size
hparams['dec_max_time_step'] = tgt_metadata.max_time_step
save_hparams(json_path, hparams)

train_graph = tf.Graph()
eval_graph = tf.Graph()
def add_text(self, text, tokenize):
    content_terms = helper.tokenize(text, tokenize)
    for i in range(len(content_terms)):
        term = '#' + content_terms[i] + '#'
        for j in range(0, len(term) - 2):
            self.query_terms.append(term[j:j + 3])
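# For illustration: in the loop above, the term "cat" is padded to "#cat#" and
# contributes the letter trigrams '#ca', 'cat', 'at#' to self.query_terms.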
parser.add_argument("-ckpt_prefix", "--ckpt_prefix", nargs="?", help="checkpoint path prefix", type=str) parser.add_argument('-hparams', "--hparams", help="path to hyperparameters file (.json)", type=str) args = parser.parse_args() src_metadata = load_metadata('src_metadata.dill') tgt_metadata = load_metadata('tgt_metadata.dill') src_inputs = np.array([tokenize(sentence, src_metadata, source=True, reverse=True)\ for sentence in src_sentences]) fp = open(args.hparams, 'r') hparams = json.load(fp) ckpt_path = os.path.join(args.ckpt_prefix, 'model.ckpt') infer_graph = tf.Graph() with infer_graph.as_default(): infer_model = Model("infer", hparams) with tf.Session(graph=infer_graph) as sess: sess.run(tf.global_variables_initializer()) infer_model.saver.restore(sess, ckpt_path) while True:
# load gram counts
gram_count = {}
with open(args["gramcnt"], "r") as F:
    for line in F:
        gram_string, count = line.split(",")
        count = int(count)
        if gram_string in gram_label:
            gram_count[gram_string] = count
print "Finish Loading Gram Counts."

# dump word embeddings
cPickle.dump(embedding, open(join(args["output"], "sentiment_custom_We"), 'wb'))

for file_name in ['train-rootfine', 'dev-rootfine', 'test-rootfine']:
    output = []
    with open(join(args["input"], file_name), "r") as F:
        for line in F:
            vector = line.split(" ")
            label = vector[0]
            sentence = sanitize_line(
                filter_with_alphabet(" ".join(vector[1:]), alphabet))
            for i in range(repeat):
                tokenized_sentence = tokenize(gram_count, sentence, gram_length, token_weight)
                for j in range(len(tokenized_sentence)):
                    # 0 is the default label for grams not in gram_label
                    tokenized_sentence[j] = gram_label.get(tokenized_sentence[j], 0)
                output.append([tokenized_sentence, label])
    cPickle.dump(output, open(join(args["output"], file_name), 'wb'))
    print "Finish Dumping File " + file_name