def __iter__(self):
    """Stream (sent_id, para_num, pos_bigrams) triples from annotation files.

    Each input line is one JSON record with an 'id' and an 'annotation' XML
    payload; consecutive records sharing the same id are numbered as
    successive paragraphs of that document.
    """
    # NOTE(review): only the first file is streamed ([0:1]) — looks like a
    # debugging leftover; confirm before widening to the full file list.
    for filename in self.file_list[0:1]:
        sent_file = os.path.join(self.annotation_dir, filename)
        with open(sent_file) as file:
            lc = LoopTimer(update_after=100)
            last_id = None
            para_num = 0  # initialized defensively; reset on every new id
            for line in file:
                if self.print_status:
                    lc.update("Posbigram Sent Stream")
                data = json.loads(line)
                xml = data['annotation']
                sent_id = data['id']  # renamed from 'id' to avoid shadowing the builtin
                if last_id != sent_id:
                    para_num = 0
                else:
                    para_num += 1
                last_id = sent_id
                token_list = mf.xml2words(xml)
                pos_list = mf.xml2pos(xml)
                # Pair each sentence's tokens with its POS tags directly
                # instead of indexing both lists by position.
                for sent_tokens, sent_tags in zip(token_list, pos_list):
                    token_cleaned, pos_cleaned = utils.posFilterString(sent_tokens, sent_tags)
                    if token_cleaned:  # skip sentences that filter down to nothing
                        yield sent_id, para_num, utils.makeBigrams(pos_cleaned)
def nlp_to_doc_token(annotation, token_type, clean=True, lower=False, bigrams=False, dictionary=None):
    """Flatten an NLP annotation into one document-level token list.

    Args:
        annotation: dict with a 'sentences' list; each sentence has a
            'tokens' list of dicts carrying at least 'pos' and token_type keys.
        token_type: which token attribute to extract (e.g. 'word', 'lemma').
        clean: if True, filter tokens/POS through utils.posFilterString.
        lower: if True, lowercase each extracted token.
        bigrams: if True, replace each sentence's tokens with their bigrams.
        dictionary: optional gensim dictionary; tokens absent from its
            token2id are dropped.

    Returns:
        A flat list of tokens (or bigrams) over all sentences.
    """
    doc_tokens = []
    for sentence in annotation['sentences']:
        pos_list = [token['pos'] for token in sentence['tokens']]
        token_list = [
            token[token_type].lower() if lower else token[token_type]
            for token in sentence['tokens']
        ]
        if clean:
            # Only the filtered tokens are needed; the cleaned POS list is unused.
            token_list, _ = utils.posFilterString(token_list, pos_list)
        if dictionary is not None:
            token_list = [word for word in token_list if word in dictionary.token2id]
        if bigrams:
            token_list = utils.makeBigrams(token_list)
        doc_tokens.extend(token_list)
    return doc_tokens
def __iter__(self):
    """Stream (doc_id, pos_bigram_list) pairs, one per document.

    Lines belonging to the same id are accumulated; a document is yielded
    as soon as a line with a different id appears (and once more at EOF).
    """
    for filename in self.file_list[0:1]:
        path = os.path.join(self.annotation_dir, filename)
        with open(path) as infile:
            timer = LoopTimer(update_after=100)
            doc_bigrams = []
            prev_id = None
            for line in infile:
                if self.print_status:
                    timer.update("Posbigram Doc Stream")
                record = json.loads(line)
                doc_id = record['id']
                annotation_xml = record['annotation']
                # A new id means the previous document is complete.
                if prev_id != doc_id and doc_bigrams:
                    yield prev_id, doc_bigrams
                    doc_bigrams = []
                prev_id = doc_id
                sentences = mf.xml2words(annotation_xml)
                tag_lists = mf.xml2pos(annotation_xml)
                for sent_tokens, sent_tags in zip(sentences, tag_lists):
                    _, tags_clean = utils.posFilterString(sent_tokens, sent_tags)
                    doc_bigrams.extend(utils.makeBigrams(tags_clean))
            # Emit whatever is buffered for the last document in the file.
            if doc_bigrams:
                yield doc_id, doc_bigrams
def get_feature_vector(self, word_corpus, pos_corpus):
    """Build the sparse feature matrix (one row per sentence) for a document.

    word_corpus and pos_corpus are iterated in lockstep: each pair is one
    sentence's word tokens and POS tags.  Per sentence the method collects
    four tf-idf vectors (word / word-bigram / pos / pos-bigram) plus
    concreteness-rating statistics, then packs the feature groups enabled
    in self.feature_set into a scipy.sparse.csc_matrix of shape
    (n_sentences, self.vector_len).
    """
    sent_infos = list()
    max_sent = 0
    sent_id = 0
    for words, pos in zip(word_corpus, pos_corpus):
        sent_id += 1
        max_sent += 1  # ends up equal to the total sentence count
        wordbigram = makeBigrams(words)
        posbigram = makeBigrams(pos)
        # tf-idf vectors over the four token views of this sentence
        word_bow = self.word_dic.doc2bow(words)
        vec_word_tfidf = self.word_tfidf[word_bow]
        wordbigram_bow = self.wordbigram_dic.doc2bow(wordbigram)
        vec_wordbigram_tfidf = self.wordbigram_tfidf[wordbigram_bow]
        pos_bow = self.pos_dic.doc2bow(pos)
        vec_pos_tfidf = self.pos_tfidf[pos_bow]
        posbigram_bow = self.posbigram_dic.doc2bow(posbigram)
        vec_posbigram_tfidf = self.posbigram_tfidf[posbigram_bow]
        # Collecting Concreteness-Ratings
        cr_min = 1000  # sentinel; assumes all ratings are < 1000 — TODO confirm
        cr_max = 0
        cr_mean = 0
        cr_words = [
            cr_word for cr_word in words if cr_word in self.conc_rating
        ]
        for word in cr_words:
            rating = self.conc_rating[word]
            cr_mean += rating
            if rating > cr_max:
                cr_max = rating
            if rating < cr_min:
                cr_min = rating
        if cr_min > cr_max:
            # no rated word in this sentence -> zero out all three features
            cr_max_feature = 0
            cr_min_feature = 0
            cr_mean_feature = 0
        else:
            cr_mean = cr_mean / len(cr_words)
            cr_max_feature = cr_max
            cr_min_feature = cr_min
            cr_mean_feature = cr_mean
        sent_info = dict()
        sent_info['cr_max_feature'] = cr_max_feature
        sent_info['cr_min_feature'] = cr_min_feature
        sent_info['cr_mean_feature'] = cr_mean_feature
        sent_info['vec_word_tfidf'] = vec_word_tfidf
        sent_info['vec_wordbigram_tfidf'] = vec_wordbigram_tfidf
        sent_info['vec_pos_tfidf'] = vec_pos_tfidf
        sent_info['vec_posbigram_tfidf'] = vec_posbigram_tfidf
        sent_info['sent_id'] = sent_id
        sent_infos.append(sent_info)
    # Second pass: pack the enabled feature groups into COO-style triplets
    # (row index, column index, value) before building the sparse matrix.
    feature_data_array = []
    feature_row = []
    feature_col = []
    row_count = 0
    for feature_data in sent_infos:
        sid = feature_data['sent_id']
        vector_offset = 0  # running column offset of the current feature group
        if 'location' in self.feature_set:
            # relative sentence position within the document, in (0, 1]
            feature_row.append(row_count)
            feature_col.append(vector_offset)
            feature_data_array.append(sid / max_sent)
            vector_offset += 1
        if 'concreteness' in self.feature_set:
            cr_max_feature = float(feature_data['cr_max_feature'])
            cr_min_feature = float(feature_data['cr_min_feature'])
            cr_mean_feature = float(feature_data['cr_mean_feature'])
            feature_row.append(row_count)
            feature_col.append(vector_offset)
            feature_data_array.append(cr_max_feature)
            feature_row.append(row_count)
            feature_col.append(vector_offset + 1)
            feature_data_array.append(cr_min_feature)
            feature_row.append(row_count)
            feature_col.append(vector_offset + 2)
            feature_data_array.append(cr_mean_feature)
            vector_offset += 3
        if 'wordunigram' in self.feature_set:
            append_vec2data(feature_data['vec_word_tfidf'], feature_data_array,
                            feature_row, feature_col, row_count, vector_offset)
            vector_offset += self.word_vec_len
        if 'wordbigram' in self.feature_set:
            append_vec2data(feature_data['vec_wordbigram_tfidf'], feature_data_array,
                            feature_row, feature_col, row_count, vector_offset)
            vector_offset += self.wordbigram_vec_len
        if 'posunigram' in self.feature_set:
            append_vec2data(feature_data['vec_pos_tfidf'], feature_data_array,
                            feature_row, feature_col, row_count, vector_offset)
            vector_offset += self.pos_vec_len
        if 'posbigram' in self.feature_set:
            append_vec2data(feature_data['vec_posbigram_tfidf'], feature_data_array,
                            feature_row, feature_col, row_count, vector_offset)
            vector_offset += self.posbigram_vec_len
        row_count += 1
    feature_row = np.array(feature_row)
    feature_col = np.array(feature_col)
    feature_data_array = np.array(feature_data_array)
    # NOTE(review): presumably self.vector_len equals the summed widths of
    # the enabled feature groups — verify where feature_set is configured.
    feature_vector = scipy.sparse.csc_matrix(
        (feature_data_array, (feature_row, feature_col)),
        shape=(row_count, self.vector_len))
    return feature_vector
# Join the per-token-type frames into one table, one column per token type.
df = wordDF.join(lemmaDF).join(fineposDF).join(coarseposDF).join(
    mergedwordDF)  # .join(wordlowermergedDF)

# token_types and dic_paths are parallel sequences: iterate them together
# instead of indexing both by position.
for token_type, dic_path in zip(token_types, dic_paths):
    is_bigram = "bigram" in token_type
    if is_bigram:
        token_type = token_type[:-6]  # strip the trailing "bigram" suffix
    print(f"Build Corpus for {token_type} - Bigram: {is_bigram}")
    corpus = []
    for _abstract_id, row in df.iterrows():
        # Columns hold tab-separated token strings; collapse doubled tabs
        # before splitting so no empty tokens are produced by them.
        tokens = row[token_type].replace("\t\t", "\t").split("\t")
        if is_bigram:
            tokens = makeBigrams(tokens)
        corpus.append(tokens)
    print("Build Dictionary")
    dictionary = gensim.corpora.Dictionary()
    # prune_at=None keeps the full vocabulary (no automatic pruning).
    dictionary.add_documents(corpus, prune_at=None)
    print("Save Dictionary")
    dictionary.save(dic_path)
    print(dictionary)
feature_col, row_count, vector_offset) vector_offset += posbigram_vec_len row_count += 1 max_sent = 0 sent_infos.clear() last_abstract_id = abstract_id max_sent += 1 label_key = (abstract_id, sent_id) if (label_key in label_dic) and (label_count[label_dic[label_key]] < label_limit): wordbigram = utils.makeBigrams(word_tokens) posbigram = utils.makeBigrams(pos_tokens) word_bow = word_dic.doc2bow(word_tokens) vec_word_tfidf = word_tfidf[word_bow] wordbigram_bow = wordbigram_dic.doc2bow(wordbigram) vec_wordbigram_tfidf = wordbigram_tfidf[wordbigram_bow] pos_bow = pos_dic.doc2bow(pos_tokens) vec_pos_tfidf = pos_tfidf[pos_bow] posbigram_bow = posbigram_dic.doc2bow(posbigram) vec_posbigram_tfidf = posbigram_tfidf[posbigram_bow] # Collecting Concreteness-Ratings cr_min = 1000
def _concreteness_features(words, conc_rating):
    """Return (max, min, mean) concreteness ratings over the rated words.

    Words missing from the rating table are ignored; when no word is rated,
    all three features are 0.  Replaces the previous sentinel-based scan
    (cr_min = 1000), which silently assumed every rating is below 1000.
    """
    ratings = [conc_rating[word] for word in words if word in conc_rating]
    if not ratings:
        return 0, 0, 0
    return max(ratings), min(ratings), sum(ratings) / len(ratings)


def _flush_doc(featfile, sent_infos):
    """Write the buffered sentence records of one document, then clear them.

    Each record is stamped with 'max_sent' (the document's sentence count)
    and serialized as one JSON object per line.  Extracted because this
    flush was duplicated mid-loop and after the loop in build_feature_file.
    """
    max_sent = len(sent_infos)
    for sent_info in sent_infos:
        sent_info['max_sent'] = max_sent
        featfile.write(json.dumps(sent_info) + '\n')
    sent_infos.clear()


def build_feature_file(dtype):
    """Build the AP feature file for one data type.

    Loads the gensim dictionaries and tf-idf models for *dtype*, streams
    the word/POS sentence corpora in lockstep, and appends one JSON line
    per sentence (tf-idf vectors + concreteness features + position info)
    to ap_features.json, preceded by a header line with the vector lengths.
    """
    dictionary_dir = os.path.join(
        dirname, '../../data/processed/' + dtype + '/dictionaries')
    tfidf_dir = os.path.join(dirname,
                             '../../data/processed/' + dtype + '/tfidf')
    feature_file = os.path.join(
        dirname, '../../data/processed/' + dtype + '/features/ap_features.json')
    # Start from a clean slate: the file is opened in append mode below.
    if os.path.isfile(feature_file):
        os.remove(feature_file)

    word_dic = gensim.corpora.Dictionary.load(
        os.path.join(dictionary_dir, 'word.dic'))
    wordbigram_dic = gensim.corpora.Dictionary.load(
        os.path.join(dictionary_dir, 'wordbigram.dic'))
    pos_dic = gensim.corpora.Dictionary.load(
        os.path.join(dictionary_dir, 'pos.dic'))
    posbigram_dic = gensim.corpora.Dictionary.load(
        os.path.join(dictionary_dir, 'posbigram.dic'))
    word_tfidf = gensim.models.TfidfModel.load(
        os.path.join(tfidf_dir, 'words_model.tfidf'))
    wordbigram_tfidf = gensim.models.TfidfModel.load(
        os.path.join(tfidf_dir, 'wordbigrams_model.tfidf'))
    pos_tfidf = gensim.models.TfidfModel.load(
        os.path.join(tfidf_dir, 'pos_model.tfidf'))
    posbigram_tfidf = gensim.models.TfidfModel.load(
        os.path.join(tfidf_dir, 'posbigrams_model.tfidf'))
    conc_rating = load_concratings()
    word_corpus = corpora.word_sent_stream(dtype)
    pos_corpus = corpora.pos_sent_stream(dtype)

    with open(feature_file, "a") as featfile:
        # Header line: vector lengths needed to decode the tf-idf features.
        information = {}
        information['word_vec_len'] = len(word_dic)
        information['wordbigram_vec_len'] = len(wordbigram_dic)
        information['pos_vec_len'] = len(pos_dic)
        information['posbigram_vec_len'] = len(posbigram_dic)
        featfile.write(json.dumps(information) + '\n')

        sent_infos = []
        last_doc_id = None
        lt = LoopTimer(update_after=100)
        for word_sent, pos_sent in zip(word_corpus, pos_corpus):
            if word_sent[0] != pos_sent[0]:
                # Streams are out of sync for this pair — skip it.
                continue
            doc_id = word_sent[0]
            pid = word_sent[1]
            words = word_sent[2]
            pos = pos_sent[2]

            wordbigrams = utils.makeBigrams(words)
            posbigrams = utils.makeBigrams(pos)
            word_bow = word_dic.doc2bow(words)
            vec_word_tfidf = word_tfidf[word_bow]
            wordbigram_bow = wordbigram_dic.doc2bow(wordbigrams)
            vec_wordbigram_tfidf = wordbigram_tfidf[wordbigram_bow]
            pos_bow = pos_dic.doc2bow(pos)
            vec_pos_tfidf = pos_tfidf[pos_bow]
            posbigram_bow = posbigram_dic.doc2bow(posbigrams)
            vec_posbigram_tfidf = posbigram_tfidf[posbigram_bow]

            cr_max_feature, cr_min_feature, cr_mean_feature = \
                _concreteness_features(words, conc_rating)

            # A new doc id means the previous document is complete: flush it
            # before buffering the first sentence of the new document.
            if (last_doc_id is not None) and (last_doc_id != doc_id):
                _flush_doc(featfile, sent_infos)

            sent_info = dict()
            sent_info['cr_max_feature'] = cr_max_feature
            sent_info['cr_min_feature'] = cr_min_feature
            sent_info['cr_mean_feature'] = cr_mean_feature
            sent_info['vec_word_tfidf'] = vec_word_tfidf
            sent_info['vec_wordbigram_tfidf'] = vec_wordbigram_tfidf
            sent_info['vec_pos_tfidf'] = vec_pos_tfidf
            sent_info['vec_posbigram_tfidf'] = vec_posbigram_tfidf
            sent_info['id'] = doc_id
            sent_info['paragraphID'] = pid
            sent_info['sent_id'] = len(sent_infos)
            sent_infos.append(sent_info)
            last_doc_id = doc_id
            lt.update("Build AP Features")

        # Flush whatever is buffered for the final document.
        _flush_doc(featfile, sent_infos)