def indexit(tokenizer, filenames):
    indexer = Indexer(tokenizer)
    for filename in filenames:
        corpus_reader = CorpusReader(filename)
        indexer.index(corpus_reader)
    indexer.sort()
    return indexer
def main():
    args = parse_args()
    r = CorpusReader(accent_map=args.accents, filter_punct=args.filter_punct,
                     lower=args.lower)
    featdict, labels = r.get_featdict_from_lines(stdin, window=args.window)
    vec = DictVectorizer()
    X = vec.fit_transform(featdict).toarray()
    y, label_d = convert_labels(labels)
    cnt = defaultdict(int)
    # for l in y:
    #     cnt[label_d[l]] += 1
    # for k, v in cnt.iteritems():
    #     print('{0} {1}'.format(k.encode('utf8'), v))
    # print(label_d)
    # print(vec.fit_transform(featdict).toarray())
    # print(vec.get_feature_names())
    run_pipeline(X, y)
def index(self, corpus_reader: CorpusReader):
    update = self.__index.update
    for pmid, document in corpus_reader.items():
        update(pmid, document)
        self.__documents.setdefault(pmid, 0)
        if self.__process.memory_percent() >= self.__max_memory_usage:
            self.__dispatch()
def __init__(
    self,
    files=[],
    directories=[],
    skip=[],
    unigram_dictionary=None,
    noise_ratio=15,
    kernel=[1, 2, 3, 4, 5, 5, 4, 3, 2, 1],
    t=1.0e-5,
    batch_size=1000,
    parse=default_parse,
    verbose=True
):
    # Get a corpus reader
    self.corpus_reader = CorpusReader(
        files=files, directories=directories, skip=skip, parse=parse,
        verbose=verbose
    )

    # Load the unigram_dictionary
    if unigram_dictionary is not None:
        self.unigram_dictionary = unigram_dictionary
    else:
        self.unigram_dictionary = UnigramDictionary()

    self.noise_ratio = noise_ratio
    self.kernel = kernel
    self.t = t
    self.batch_size = batch_size

    # Validate the kernel.  It should reflect the relative frequencies of
    # choosing tokens from a window of +/- K tokens relative to a query
    # token, so it must have an even number of entries.
    if not len(self.kernel) % 2 == 0:
        raise ValueError(
            'kernel should reflect the relative frequencies of '
            'selecting a context token within +/- K of the query '
            'token, and so should have an equal number of entries '
            'defining frequencies to the left and right of the query '
            'token, and so should have an even number of entries.'
        )
def __init__(self, path, corpus_path=''):
    self.base_path = os.path.expanduser(path)
    self.models_path = os.path.join(self.base_path, "models")
    if corpus_path == '':
        corpus_path = os.path.join(self.base_path, "corpus")
    self.corpus_path = corpus_path
    self.filters = {'language': 'english'}
    os.makedirs(self.base_path, exist_ok=True)
    os.makedirs(self.models_path, exist_ok=True)
    os.makedirs(self.corpus_path, exist_ok=True)
    self.cr = CorpusReader(corpus_path)
def indexit(tokenizer, filenames, store_positions=False, calculate_tfidf=False,
            memory_usage=20):
    index = Index(tokenizer, store_positions)
    indexer = Indexer(index, 'index', max_memory_usage=memory_usage)
    for filename in filenames:
        indexer.index(CorpusReader(filename))
    indexer.merge(calculate_tfidf)
    return index
def run(self):
    corpus_reader = CorpusReader(self.path)
    corpus_reader.load()
    analyser = SentimentIntensityAnalyzer()
    num_subs = len(corpus_reader.subjects)
    for i, sub in enumerate(corpus_reader.subjects):
        print(f"Number of subjects left: {num_subs - i}")
        for post in sub.posts:
            score = analyser.polarity_scores(str(post))
            s = score['compound']
            if abs(s) > self.threshold:
                string = spplit(str(post))  # spplit: splitting helper assumed to be imported elsewhere
                for j in range(3):
                    for k in range(len(string) - j):
                        score_word = analyser.polarity_scores(
                            ' '.join(string[k:(k + j)]))
                        word_compound = score_word['compound']
                        if abs(word_compound) > self.threshold:
                            if string[k] not in self.imp_words:
                                self.imp_words.append(
                                    ' '.join(string[k:(k + j)]))
def indexit(tokenizer, filenames, store_positions=False, calculate_tfidf=False,
            memory_usage=20):
    indexer = Indexer(tokenizer, 'indexer', store_positions=store_positions,
                      max_memory_usage=memory_usage)
    for filename in filenames:
        corpus_reader = CorpusReader(filename)
        indexer.index(corpus_reader)
    indexer.merge(calculate_tfidf)
    return indexer
def main():
    path = os.path.join('..', '..', 'dataset', 'eRISK2020_T1_training_data', 'td')
    print("Creating Corpus Reader for training")
    corpus_reader_train = CorpusReader(path)
    corpus_reader_train.load()
    print("Corpus Reader for training created")

    path = os.path.join('..', '..', 'dataset', 'T1_test_data', 'td')
    gt_name = 'T1_erisk_golden_truth.txt'
    corpus_reader_test = CorpusReader(path, gt_name)
    corpus_reader_test.load()

    all_texts = [
        ''.join(map(lambda x: str(x), subject.posts))
        for subject in corpus_reader_train.subjects
    ]
    all_gt = [subject.gt for subject in corpus_reader_train.subjects]

    count_vectorizer = CountVectorizer(analyzer='word', token_pattern=r'\w+',
                                       ngram_range=(1, 2))
    bow = dict()
    bow["train"] = (count_vectorizer.fit_transform(all_texts), all_gt)

    lr_classifier = LogisticRegression(solver='liblinear')
    lr_classifier.fit(*bow["train"])

    matrix = Matrix(len(corpus_reader_test.subjects), corpus_reader_test.subjects)
    args = {'matrix': matrix, 'vec': count_vectorizer, 'class': lr_classifier}
    matrix = run_simulation(args)
    print(matrix)

    # analyze results
    precision = measures.calc_precision(corpus_reader_test.subjects, matrix)
    recall = measures.calc_recall(corpus_reader_test.subjects, matrix)
    f1 = measures.calc_f1(precision, recall)
    ERDE = measures.calc_ERDE(corpus_reader_test.subjects, matrix)
from corpus_reader import CorpusReader
from preprocess import PreProcess
from tf_idf import TfIdf
from knn import KNN
from metrics import MetricsGenerator
from pprint import pprint as pp

if __name__ == '__main__':
    print('reading...')
    reader = CorpusReader()
    reader.run()

    parser = PreProcess()
    parsed_trainning_documents = {}
    print('processing...')
    for k, v in reader.train.items():
        parsed_trainning_documents[k] = parser.process(v)

    # Input for tf-idf: each document must be annotated with its class.
    # It receives an array of tuples as input: ([tokens], class).
    parsed_trainning_documents_with_classes = []
    for k in parsed_trainning_documents.keys():
        parsed_trainning_documents_with_classes += [
            (v, k) for v in parsed_trainning_documents[k]
        ]

    # Run tf-idf
    print('generating tf.idf...')
    tf_idf_calculator = TfIdf(parsed_trainning_documents_with_classes)
    tf_idf_calculator.run()

    # Test the KNN parameters: distance metric and value of K
    for metric in ['cosine', 'euclid']:
def train6():
    with open("log.txt", 'w') as f:
        pass

    #path1 = os.path.join('..', '..', 'dataset', 'eRISK2020_T1_training_data', 'train')
    #path1 = os.path.join('..', 'data', 'erisk-2021-t2', 'td')
    path1 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRISK2020_T1_training_data',
                         'eRISK2020_T1_training_data',
                         'eRISK2020_training_data')
    path2 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRisk2020_T1_test_data', 'eRisk2020_T1_test_data',
                         'T1')

    print("Creating Corpus Reader for training")
    corpus_reader_train = CorpusReader(path1)
    corpus_reader_train.load()
    print("Corpus Reader for training created")

    corpus_reader_test = CorpusReader(path2)
    corpus_reader_test.load()
    print("Corpus Reader for testing created")

    emo = Emojis()
    token = Token()

    """ set the tokenizer and model parameters """
    #tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    #bert_model = BertModel.from_pretrained("bert-base-uncased")
    bert_model = SentenceTransformer('paraphrase-mpnet-base-v2')
    #device = torch.device("cuda")
    #bert_model.to(device)

    # create the bert
    bert_transformer = BigBird(bert_model)
    sentiment = Sentiment()

    """ training the model """
    print("Initializing Training")

    #n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0
    parameters = {
        'classifier__n_estimators': [50, 100, 500, 1000],
        'classifier__learning_rate': [0.001, 0.01, 0.1, 1.0],
        'classifier__max_depth': [1, 3, 5, 10],
    }
    classifier = GradientBoostingClassifier()

    model = Pipeline([
        ('emojis', emo),
        #('tokenizer', token),
        ('union', FeatureUnion(transformer_list=[
            ("vectorizer", bert_transformer),
            ("sentiment", sentiment),
        ])),
        ("classifier", classifier),
    ])
    clf = GridSearchCV(model, parameters)

    batch_size = 40
    num_users = len(corpus_reader_train.subjects)
    #print(num_users)
    for j in range(50, 2000, 50):
        count = 0
        all_texts = list()
        all_gt = list()
        for i in range(0, num_users, batch_size):
            #print(i)
            all_texts.append([
                subject.posts[0:j]
                for subject in corpus_reader_train.subjects[(batch_size * count):(batch_size * (count + 1))]
            ])
            all_gt.append([
                subject.gt
                for subject in corpus_reader_train.subjects[(batch_size * count):(batch_size * (count + 1))]
            ])
            count += 1
        print(all_gt[0])
        for i in range(len(all_texts)):
            clf.fit(all_texts[i], all_gt[i])

    num_users = len(corpus_reader_test.subjects)
    #print(num_users)
    for j in range(50, 2000, 50):
        all_texts = list()
        all_gt = list()
        count = 0
        for i in range(0, num_users, batch_size):
            print(i)
            all_texts.append([
                subject.posts[0:j]
                for subject in corpus_reader_test.subjects[(batch_size * count):(batch_size * (count + 1))]
            ])
            all_gt.append([
                subject.gt
                for subject in corpus_reader_test.subjects[(batch_size * count):(batch_size * (count + 1))]
            ])
            count += 1
        for i in range(len(all_texts)):
            clf.fit(all_texts[i], all_gt[i])

    print("End of training")
    return clf
__author__ = 'rwechsler'
import gensim
import sys
import glob
from corpus_reader import CorpusReader

files = glob.glob(sys.argv[1])
outfile_name = sys.argv[2]

dataset = CorpusReader(files)
model = gensim.models.Word2Vec(dataset, size=500, window=5, min_count=3,
                               negative=5, workers=2)
model.save(outfile_name)
""" data_path_train = os.path.join(args.data_dir, "train.csv.pkl") data_path_test = os.path.join(args.data_dir, "test.csv.pkl") model_name = args.model_name emb_path = args.emb_path save_dir = args.save_dir logging.info("loading word emb") word2idx, embedding_matrix = GloveEmbeddings.get_embeddings_with_custom_tokens( path=emb_path, embedding_dim=dim) vocab_size = len(word2idx) logging.info("word emb loaded: {}".format(vocab_size)) logging.info("loading dataset") X_train, Y_train, X_dev, Y_dev, X_test, _ = CorpusReader.get_question_pair_data( data_path_train, data_path_test) """ trim the test set is desired """ if args.size_test_set: X_test = X_test[:args.size_test_set] assert (len(X_train) == len(Y_train)), "Train data and label size mismatch" logging.info("train size: {}, test size: {}, dev size: {}".format( len(X_train), len(X_test), len(X_dev))) logging.info("loaded dataset") list_classes = ["0", "1"] model = QuestionPairDecomposableAttModelTF( v=vocab_size, d=dim,
("../../corpus/corpus_cine","*.xml","<body>(.*?)</body>","rank=\"(.*?)\"","FILE", "BEFORE",None,0,'utf8',{u'1': 0, u'2': 25, u'3': 50, u'4': 75, u'5': 100}), ("../../corpus/corpus_hoteles","*.xml","<coah:review>(.*?)</coah:review>","<coah:rank>(.*?)</coah:rank>","FILE", "BEFORE",None,0,'utf8',{u'1': 0, u'2': 25, u'3': 50, u'4': 75, u'5': 100}), ("../../corpus/corpus_prensa_uy","*.csv","\"(.*?)\",(?:TRUE|FALSE)",",(.*?)\\n","FILE", "AFTER",None,0,'utf8',{u'Neg': 0, u'Neu': 50, u'Pos': 100}), ("../../corpus/corpus_tweets","*.tsv","(.*?)\\t.*?\\n","(.*?\\t.*?)\\t","FILE", "BEFORE",None,1,'utf8',{u'3\t1': 10, u'3\t2': 20, u'2\t4': 90, u'2\t2': 70, u'2\t3': 60, u'4\t2': 30, u'2\t1': 80, u'5\t1': 40, u'1\t5': 50, u'1\t4': 30, u'4\t1': 50, u'1\t1': 40, u'1\t3': 60, u'1\t2': 70}), ("../../corpus/corpus_variado_sfu","*/*.txt","(.*)\s","(.*?)_","PATH", None,1,0,'utf8',{'no': 0, 'yes': 100}) ] # Read each corpus from corpus_reader import CorpusReader for parameter in parameters: reader = CorpusReader( parameter[0], parameter[1], parameter[2], parameter[3], parameter[4], category_position=parameter[5], category_level=parameter[6], start=parameter[7], decoding=parameter[8], ) fun = parameter[9] data = reader.get_data(lambda x:fun[x])
def get_input_option(prompt, options):
    res = input(prompt + " (" + "/".join(options) + ") ")
    while res not in options:
        res = input("pardon? (" + "/".join(options) + ") ")
    return res


if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(description='Corpus Filter')
    arg_parser.add_argument('corpus_file', help='path to the corpus file')
    arg_parser.add_argument('output_prefix', help='path to the output files')
    args = arg_parser.parse_args()

    print('\n - Filtering Corpus -\n')
    corpus = CorpusReader(args.corpus_file)

    file_output_pos = open(args.output_prefix + '.pos', 'w', encoding='utf8')
    file_output_neg = open(args.output_prefix + '.neg', 'w', encoding='utf8')
    file_output_fav = open(args.output_prefix + '.fav', 'w', encoding='utf8')

    for tweet in corpus.text_json():
        tweet = tweet.replace('\n', ' ')
        tweet = tweet.strip()
        print('"' + tweet + '"')
        action = get_input_option('sarcasm detected?', ['y', 'n', 'f', 'q'])
        if action == 'f':
            file_output_fav.write(tweet + '\n')
            action = get_input_option('faved, but is there sarcasm?', ['y', 'n', 'q'])
        if action == 'y':
            file_output_pos.write(tweet + '\n')
#mod3 = pickle.load(open(MODEL3_NAME, 'rb'))
#mod4 = pickle.load(open(MODEL4_NAME, 'rb'))
#mod5 = pickle.load(open(MODEL5_NAME, 'rb'))
#mod6 = pickle.load(open(MODEL6_NAME, 'rb'))
#mod7 = pickle.load(open(MODEL7_NAME, 'rb'))

#device = torch.device("cuda")
#no_vader.to(device)

path = os.path.join('..', 'data', 'erisk-2021-t2')
#path = os.path.join('..', '..', 'dataset', 'T1_test_data', 'test')
gt_name = 'golden_truth.txt'
corpus_reader_test = CorpusReader(path)
corpus_reader_test.load()

with open("file.txt", 'w') as f:
    for sub in corpus_reader_test.subjects:
        f.write("{} - {}\n".format(sub.id, sub.gt))

filename = "RESULTS_TEST_more_model3_no_token_param.txt"

# clean file
with open(filename, 'w') as file:
    pass

# find the greatest number of posts
posts_max = max([len(s.posts) for s in corpus_reader_test.subjects])
print(posts_max)
def __init__(
        self,
        max_len=50,       # Maximum sentence length, same for questions, answers and reviews
        num_reviews=20,   # Number of review candidates for each QA pair
        selftest=False,
        if_only_top_ans=True,
        top_score_recorder=None,
        load_meta=True,
        load_vocab=True,
        load_qa=True,
        load_review=True,
        load_word_embedding=True):
    try:
        # if not selftest:
        #     filename = os.path.join(DATA_PATH, 'datautil.pickle')
        # else:
        #     filename = os.path.join(DATA_PATH, 'datautil-selftest.pickle')
        # logger.info('Loading stored data from {} ...'.format(filename))
        # with open(filename, 'rb') as f:
        #     tmp_dict = pickle.load(f)
        # self.__dict__.clear()
        # self.__dict__.update(tmp_dict)
        self.selftest = selftest
        if load_meta:
            self._load_meta()
        if load_vocab:
            self._load_vocab()
        if load_qa:
            self._load_qa()
        if load_review:
            self._load_review()
        if load_word_embedding:
            self._load_word_embedding()
    except IOError:
        logger.info('Stored data not found, preprocessing ...')
        self.selftest = selftest
        self.max_len = max_len
        self.num_reviews = num_reviews

        logger.info('Initializing CorpusReader ...')
        corpusreader = CorpusReader(
            maxline=SELF_TEST_MAX_LINE if selftest else -1,
            num_reviews=(5 * self.num_reviews),
            if_only_top_ans=if_only_top_ans,
            load_glove=False if selftest else True)
        self.vocab_size = corpusreader.vocab_size
        self.num_pos_tags = corpusreader.num_pos_tags
        self.embed_matrix = corpusreader.embed_matrix
        self.w_embed_size = corpusreader.w_embed_size
        self.word2id = corpusreader.word2id
        self.id2word = corpusreader.id2word
        self.id2freq = corpusreader.id2freq
        self.pos2id = corpusreader.pos2id
        self.id2pos = corpusreader.id2pos

        logger.info('Read corpus data and convert to arrays ...')
        data, review_data, asin2id = self._read_into_arrays(
            corpusreader=corpusreader, if_only_top_ans=if_only_top_ans)
        self.review_data = review_data
        del corpusreader
        del review_data
        gc.collect()

        logger.info('Calculate review IDF ...')
        self.review_idf = self._get_review_idf()

        logger.info('Splitting data into train, dev, test sets ...')
        self._train_idx, self._dev_idx, self._test_idx = [], [], []
        self._train_size, self._dev_size, self._test_size = 0, 0, 0
        self._data_split(data)
        del data
        gc.collect()

        # logger.info('Storing into {}...'.format(filename))
        # with open(filename, 'wb') as f:
        #     pickle.dump(self.__dict__, f)
        self._save_meta()
        self._save_vocab()
        self._save_qa()
        self._save_review()
        self._save_word_embedding()

    self._block_to_dense()

    self.top_score_recorder = top_score_recorder
    if self.top_score_recorder is not None:
        logger.info("Train with Pseudo Relevance Feedbacks")

    self._print_info()
def train_model1(classifier):
    path1 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRISK2020_T1_training_data',
                         'eRISK2020_T1_training_data',
                         'eRISK2020_training_data')
    path2 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRisk2020_T1_test_data', 'eRisk2020_T1_test_data',
                         'T1')

    print("Creating Corpus Reader for training")
    corpus_reader_train = CorpusReader(path1)
    corpus_reader_train.load()
    print("Corpus Reader for training created")

    corpus_reader_test = CorpusReader(path2)
    corpus_reader_test.load()
    print("Corpus Reader for testing created")

    emo = Emojis()
    token = Token("normal")

    """ set the tokenizer and model parameters """
    #tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    #bert_model = BertModel.from_pretrained("bert-base-uncased")
    bert_model = SentenceTransformer('paraphrase-mpnet-base-v2')
    #device = torch.device("cuda")
    #bert_model.to(device)

    # create the bert
    bert_transformer = BigBird(bert_model)
    sentiment = Sentiment()

    """ training the model """
    print("Initializing Training")
    #classifier = svm.SVC(C = 1, gamma = 'scale', kernel = 'linear', probability = True)
    #clf = CalibratedClassifierCV(classifier)
    #classifier = svm.SVC(C = 1, gamma = 'scale', kernel = 'linear', probability = True)
    #classifier = AdaBoostClassifier(learning_rate = 0.01, n_estimators = 100)
    #clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)

    model = Pipeline([
        ('emojis', emo),
        ('tokenizer', token),
        ('union', FeatureUnion(transformer_list=[
            ("vectorizer", bert_transformer),
            #("sentiment", sentiment),
        ])),
        ("classifier", classifier),
    ])

    batch_size = 40
    num_users = len(corpus_reader_train.subjects)
    #print(num_users)
    for j in range(50, 2000, 50):
        count = 0
        all_texts = list()
        all_gt = list()
        for i in range(0, num_users, batch_size):
            #print(i)
            all_texts.append([
                subject.posts[0:j]
                for subject in corpus_reader_train.subjects[(batch_size * count):(batch_size * (count + 1))]
            ])
            all_gt.append([
                subject.gt
                for subject in corpus_reader_train.subjects[(batch_size * count):(batch_size * (count + 1))]
            ])
            count += 1
        print(all_gt[0])
        for i in range(len(all_texts)):
            model.fit(all_texts[i], all_gt[i])

    num_users = len(corpus_reader_test.subjects)
    #print(num_users)
    for j in range(50, 2000, 50):
        all_texts = list()
        all_gt = list()
        count = 0
        for i in range(0, num_users, batch_size):
            print(i)
            all_texts.append([
                subject.posts[0:j]
                for subject in corpus_reader_test.subjects[(batch_size * count):(batch_size * (count + 1))]
            ])
            all_gt.append([
                subject.gt
                for subject in corpus_reader_test.subjects[(batch_size * count):(batch_size * (count + 1))]
            ])
            count += 1
        for i in range(len(all_texts)):
            model.fit(all_texts[i], all_gt[i])

    print("End of training")

    # It's important to use binary mode
    dbfile = open(f'model1_{classifier.__class__.__name__}.sav', 'wb')
    pickle.dump(model, dbfile)

    return model
def add_langauge(self, pattern, response_pattern, language=ENGLISH):
    self._response = response_pattern
    CorpusReader.add_langauge(self, pattern, language)
class MinibatchGenerator(object):

    NOT_DONE = 0
    DONE = 1

    def __init__(
        self,
        files=[],
        directories=[],
        skip=[],
        unigram_dictionary=None,
        noise_ratio=15,
        kernel=[1, 2, 3, 4, 5, 5, 4, 3, 2, 1],
        t=1.0e-5,
        batch_size=1000,
        parse=default_parse,
        verbose=True
    ):
        # Get a corpus reader
        self.corpus_reader = CorpusReader(
            files=files, directories=directories, skip=skip, parse=parse,
            verbose=verbose
        )

        # Load the unigram_dictionary
        if unigram_dictionary is not None:
            self.unigram_dictionary = unigram_dictionary
        else:
            self.unigram_dictionary = UnigramDictionary()

        self.noise_ratio = noise_ratio
        self.kernel = kernel
        self.t = t
        self.batch_size = batch_size

        # Validate the kernel.  It should reflect the relative frequencies
        # of choosing tokens from a window of +/- K tokens relative to a
        # query token, so it must have an even number of entries.
        if not len(self.kernel) % 2 == 0:
            raise ValueError(
                'kernel should reflect the relative frequencies of '
                'selecting a context token within +/- K of the query '
                'token, and so should have an equal number of entries '
                'defining frequencies to the left and right of the query '
                'token, and so should have an even number of entries.'
            )

    def get_vocab_size(self):
        '''
        Get the size of the vocabulary.  Only makes sense to call this
        after MinibatchGenerator.prepare() has been called, or if an
        existing (pre-filled) UnigramDictionary was loaded, since
        otherwise it would just return 0.
        '''
        # Delegate to the underlying UnigramDictionary
        return len(self.unigram_dictionary)

    def load(self, directory):
        '''
        Load the unigram_dictionary whose files are stored in <directory>.
        '''
        # Delegate to the underlying UnigramDictionary
        self.unigram_dictionary.load(directory)

    def save(self, directory):
        '''
        Save the unigram_dictionary to <directory>.
        '''
        # Delegate to the underlying UnigramDictionary
        self.unigram_dictionary.save(directory)

    def check_access(self, savedir):
        savedir = os.path.abspath(savedir)
        path, dirname = os.path.split(savedir)

        # Make sure that the directory we want exists (make it if not)
        if not os.path.isdir(path):
            raise IOError('%s is not a directory or does not exist' % path)
        if not os.path.exists(savedir):
            os.mkdir(savedir)
        elif os.path.isfile(savedir):
            raise IOError('%s is a file.' % savedir)

        # Make sure we can write to the file
        f = open(os.path.join(savedir, '.__test-w2v-access'), 'w')
        f.write('test')
        f.close()
        os.remove(os.path.join(savedir, '.__test-w2v-access'))

    def prepare(self, savedir=None, min_frequency=None):
        '''
        Iterate over the entire corpus in order to build a
        UnigramDictionary.  We need this because we need to sample from
        the unigram distribution in producing minibatches.  Optionally
        prune all tokens that occur fewer than min_frequency times from
        the dictionary.  Use min_frequency=None (the default) to specify
        no pruning.  Optionally save the dictionary to savedir (this is
        done after pruning if pruning is requested).
        '''
        # Before doing anything, if we were requested to save the
        # dictionary, make sure we'll be able to do that (fail fast)
        if savedir is not None:
            self.check_access(savedir)

        # Read through the corpus, building the UnigramDictionary
        for line in self.corpus_reader.read_no_q():
            self.unigram_dictionary.update(line)

        # Prune the dictionary, if requested to do so.
        if min_frequency is not None:
            self.unigram_dictionary.prune(min_frequency)

        # Save the dictionary, if requested to do so.
        if savedir is not None:
            self.save(savedir)

    def prune(self, min_frequency=5):
        '''
        Exposes the prune function for the underlying UnigramDictionary
        '''
        self.unigram_dictionary.prune(min_frequency)

    def __iter__(self):
        # Once iter is called, a subprocess will be started which
        # begins generating minibatches.  These accumulate in a queue
        # and iteration pulls from that queue.  That way, iteration
        # can begin as soon as the first minibatch is prepared, and
        # later minibatches are prepared in the background while earlier
        # minibatches are used.  The idea is that this will keep the
        # CPU(s) busy while training occurs on the GPU.
        self.minibatches = Queue()
        self.recv_pipe, send_pipe = Pipe()

        # We'll fork a process to assemble minibatches, and return
        # immediately so that minibatches can be used as they are
        # constructed.
        #
        # Because we assemble the batches within a forked process, its
        # access to randomness doesn't alter the state of the parent's
        # random number generator.  Multiple calls to this function
        # would produce the same set of random samples, which is not
        # desired.  We make a call to the numpy random number generator
        # to advance the parent's random number generator's state to
        # avoid this problem:
        np.random.uniform()

        minibatch_preparation = Process(
            target=self.enqueue_minibatches,
            args=(self.minibatches, send_pipe)
        )
        minibatch_preparation.start()

        return self

    def init_batch(self):
        # Initialize np.array's to store the minibatch data.  We know
        # how big the batch is ahead of time.  Initialize by filling
        # the arrays with UNK tokens.  Doing this means that, at the end
        # of the corpus, when we don't necessarily have a full minibatch,
        # the final minibatch is padded with UNK tokens in order to be
        # of the desired shape.  This has no effect on training, because
        # we don't care about the embedding of the UNK token.
        signal_batch = np.full(
            (self.batch_size, 2), UNK, dtype='int32'
        )
        noise_batch = np.full(
            (self.batch_size * self.noise_ratio, 2), UNK, dtype='int32'
        )
        return signal_batch, noise_batch

    def generate(self):
        chooser = TokenChooser(K=len(self.kernel)/2, kernel=self.kernel)
        signal_batch, noise_batch = self.init_batch()

        # i keeps track of position in the signal batch
        i = -1
        for line in self.corpus_reader.read_no_q():

            # Isolated tokens (e.g. one-word sentences) have no context
            # and can't be used for training.
            if len(line) < 2:
                continue

            token_ids = self.unigram_dictionary.get_ids(line)

            # We'll now generate signal examples and noise examples
            # for training.
            for query_token_pos, query_token_id in enumerate(token_ids):

                # Possibly discard the token
                if self.do_discard(query_token_id):
                    continue

                # Increment position within the batch
                i += 1

                # Sample a token from the context
                context_token_pos = chooser.choose_token(
                    query_token_pos, len(token_ids)
                )
                context_token_id = token_ids[context_token_pos]
                signal_batch[i, :] = [query_token_id, context_token_id]

                # Sample tokens from the noise
                noise_context_ids = self.unigram_dictionary.sample(
                    (self.noise_ratio,))

                # Figure out the position within the noise batch
                j = i * self.noise_ratio

                # block-assign the noise samples to the noise batch array
                noise_batch[j:j + self.noise_ratio, :] = [
                    [query_token_id, noise_context_id]
                    for noise_context_id in noise_context_ids
                ]

                # Once we've finished assembling a minibatch, enqueue it
                # and start assembling a new minibatch
                if i == self.batch_size - 1:
                    yield (signal_batch, noise_batch)
                    signal_batch, noise_batch = self.init_batch()
                    i = -1

        # Normally we'll have a partially filled minibatch after processing
        # the corpus.
        # The elements in the batch that weren't overwritten contain UNK
        # tokens, which act as padding.  Enqueue the partial minibatch.
        if i >= 0:
            yield (signal_batch, noise_batch)

    def get_minibatches(self):
        '''
        Reads through the entire corpus, generating all of the minibatches
        up front, storing them in memory as a list.  Returns the list of
        minibatches.
        '''
        minibatches = []
        for signal_batch, noise_batch in self.generate():
            minibatches.append((signal_batch, noise_batch))

        return minibatches

    def enqueue_minibatches(self, minibatch_queue, send_pipe):
        '''
        Reads through the minibatches, placing them on a queue as they
        are ready.  This usually shouldn't be called directly, but is
        used when the MinibatchGenerator is treated as an iterator, e.g.:

            for signal, noise in my_minibatch_generator:
                do_something_with(signal, noise)

        It causes the minibatches to be prepared in a separate process
        using this function, placing them on a queue, while a generator
        construct pulls them off the queue as the client process requests
        them.  This keeps minibatch preparation running in the background
        while the client process is busy processing previously yielded
        minibatches.
        '''
        # Continuously iterate through the dataset, enqueuing each
        # minibatch.  The consumer will process minibatches from
        # the queue at its own pace.
        for signal_batch, noise_batch in self.generate():
            minibatch_queue.put((signal_batch, noise_batch))

        # Notify the parent process that iteration through the corpus is
        # complete (so it doesn't need to wait for more minibatches)
        send_pipe.send(self.DONE)

    def do_discard(self, token_id):
        '''
        This function helps with downsampling of very common words.
        Returns True when the token should be discarded as a query word.
        '''
        probability = self.unigram_dictionary.get_probability(token_id)
        discard_probability = 1 - np.sqrt(self.t / probability)
        do_discard = np.random.uniform() < discard_probability

        #if do_discard:
        #    print 'discarding', self.unigram_dictionary.get_token(token_id)

        return do_discard

    def next(self):
        status = self.NOT_DONE
        while status == self.NOT_DONE:
            try:
                return self.minibatches.get(timeout=0.1)
            except Empty:
                if self.recv_pipe.poll():
                    status = self.recv_pipe.recv()

        raise StopIteration
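# --- Hedged usage sketch (not part of the original sources): assuming the
# MinibatchGenerator defined above and a directory of tokenized text files,
# iteration yields (signal_batch, noise_batch) pairs assembled in a background
# process; prepare() must run first to build the unigram distribution used for
# noise sampling.  The paths and parameters below are illustrative only.
generator = MinibatchGenerator(directories=['corpus/'], batch_size=1000)
generator.prepare(savedir='unigram-dictionary', min_frequency=5)
for signal_batch, noise_batch in generator:
    pass  # feed the (query, context) pairs and noise pairs to a training step here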
class QRNNLM():

    def __init__(self, path, corpus_path=''):
        self.base_path = os.path.expanduser(path)
        self.models_path = os.path.join(self.base_path, "models")
        if corpus_path == '':
            corpus_path = os.path.join(self.base_path, "corpus")
        self.corpus_path = corpus_path
        self.filters = {'language': 'english'}
        os.makedirs(self.base_path, exist_ok=True)
        os.makedirs(self.models_path, exist_ok=True)
        os.makedirs(self.corpus_path, exist_ok=True)
        self.cr = CorpusReader(corpus_path)

    def encode_docs(self, docs):
        """
        Encodes a list of documents into the necessary format for the RNN
        Returns a tuple of vocabulary and encoded documents
        docs :: list of documents to prepare
        """
        voc = {"<s>": 0, "</s>": 1}  # mapping of words to encoding
        vlist = ["<s>", "</s>"]      # vocabulary list
        edocs = []                   # list of encoded documents
        for doc in docs:
            edoc = []
            for word in doc:
                if word not in voc:
                    voc[word] = len(vlist)
                    vlist.append(word)
                edoc.append(voc[word])
            if len(edoc) > 0:
                edoc.append(1)  # end word
                edocs.append(edoc)
        return (vlist, edocs)

    def test_models(self):
        """
        Interactively test the trained models by entering query terms,
        shows the best 5 matches
        """
        terms = input('Comma-separated list of query terms: ')
        termlist = [x.strip() for x in terms.split(',')]
        vmodels = list(self.query(termlist).items())     # find matching models
        vmodels.sort(key=lambda m: m[1], reverse=True)   # sort
        if len(vmodels) == 0:
            print('No models found!')
            return
        bmodels = vmodels[:min(5, len(vmodels))]                   # best five or less
        bmodels = [(self.id2name(idx), p) for idx, p in bmodels]   # get document paths
        for i, m in enumerate(bmodels):                            # show list of found models
            print(i + 1, m[0].split('.')[2], '\t', m[1])
        i = input('Press number of choice: ')
        fname = bmodels[int(i) - 1][0]                 # get chosen file name
        path = os.path.join(self.corpus_path, fname)   # whole path
        with open(path, 'r') as f:
            print(f.read())                            # show file content

    def id2name(self, idx):
        """
        Translates story/model id into file name of fan fiction document
        """
        files = os.listdir(self.corpus_path)
        fname = [f for f in files if f.startswith('ffnet.' + str(idx))][0]
        return fname

    def query(self, terms):
        """
        Query the trained models for terms
        Returns dictionary of model ids and corresponding probability
        terms :: list of query terms
        """
        # find only documents containing all terms using the index
        ids = set(self.index[terms[0]])
        for term in terms[1:]:
            ids = ids.intersection(self.index[term])
        # calculate probabilities for words in these models
        model_probs = {}
        for idx in ids:
            vlist, model = self.load(self.models_path, idx)
            dist = model.run([0])
            prob = 0
            for term in terms:
                pos = vlist.index(term)  # position of word in output vector
                prob += dist[pos]        # use addition for now (else: smoothing and product)
            model_probs[idx] = prob
        return model_probs

    def create_index(self):
        """
        Create index of terms and models they occur in
        """
        index = {}
        modelfiles = os.listdir(self.models_path)
        for name in modelfiles:
            if name != 'index':
                vlist, m = self.load(self.models_path, name)
                for w in vlist:
                    if w not in index:
                        index[w] = [name]
                    else:
                        index[w].append(name)
        self.index = index

    def create_single_models(self, max_count=-1, print_progress=False):
        count = 0
        max_count = max(max_count, -1)
        doc_count = self.cr.count_documents()
        if print_progress:
            print('Number of documents: %s' % str(doc_count))
        for [text, meta] in self.cr.get_corpus_iterator(**self.filters):
            idx = meta['storyid']
            p = self.train_single(5, 10, 1.2, idx, text)
            if print_progress:
                print('Trained and saved model on document no %s/%s'
                      % (str(count + 1), str(doc_count)), end='\r')
                utils.print_percent(count / doc_count)
                #print('\nid: %s' % id)
            count += 1
            if max_count != -1 and count >= max_count:
                break

    '''
    plt.figure(figsize=(20, 15))
    legends = []
    for K in range(10, 30, 5):
        for a in [0.8, 1.0, 1.2]:
            p = train_singles(5, K, a, ids, texts)
            i, per = np.array(p).T
            plt.plot(i, per)
            legends.append(['K: ' + str(K) + ', a: ' + str(a)])
    plt.legend(legends)
    plt.savefig('plots.svg')
    '''

    def train_single(self, I, K, a, name, text):
        '''
        I: number of epochs
        K: size of hidden layer
        a: learning rate alpha
        name: file/model name for saving
        text: text to train on
        '''
        perplexities = []
        # train single document model
        (vlist, docs) = self.encode_docs([text])
        print(vlist)
        V = len(vlist)  # input layer size
        model = r.RNNLM_BPTT(V, K)
        for i in range(I):
            perplexities.append([i, model.perplexity(docs)])
            model.learn(docs, a)
            a = a * 0.95 + 0.01
        perplexities.append([I, model.perplexity(docs)])
        if self.models_path != '':
            self.save(self.models_path, [vlist, model], name)
        return perplexities

    def save(self, path, data, name):
        '''
        path: save location
        data: the model to save
        name: filename to save under
        '''
        with open(os.path.join(path, name), 'wb') as f:
            pickle.dump(data, f)

    def load(self, path, name):
        '''
        path: location from which to load
        name: name of the file
        '''
        with open(os.path.join(path, name), 'rb') as f:
            data = pickle.load(f)
        return data
import tensorflow as tf

from corpus_reader import CorpusReader

batch_size = 100
neg_samples = 40
embedding_size = 200
window_size = 1


def init_weights(shape):
    init = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(init)


def init_biases(shape):
    init = tf.constant(0.1, shape=shape)
    return tf.Variable(init)


corpus = CorpusReader('data', window_size=window_size)
vocabulary_size = corpus.build_dictionary()

X_train = tf.placeholder(tf.int32, [batch_size])
y_train = tf.placeholder(tf.int32, [batch_size])
y = tf.reshape(y_train, [-1, 1])

embeddings = init_weights([vocabulary_size, embedding_size])
W = init_weights([vocabulary_size, embedding_size])
b = init_biases([vocabulary_size])

batch_embed = tf.nn.embedding_lookup(embeddings, X_train)

loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(W, b, batch_embed, y,
                                                 neg_samples, vocabulary_size))
train = tf.train.AdamOptimizer(1e-3).minimize(loss)
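# --- Hedged training-loop sketch (not part of the original source): it assumes
# a hypothetical corpus.next_batch(batch_size) helper returning aligned arrays
# of center-word and context-word ids, since the original batching code is not
# shown above.  TF1-style session API, matching the graph definition above.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(10000):
        X_batch, y_batch = corpus.next_batch(batch_size)  # hypothetical helper
        _, batch_loss = sess.run([train, loss],
                                 feed_dict={X_train: X_batch, y_train: y_batch})
        if step % 1000 == 0:
            print('step {}, loss {:.4f}'.format(step, batch_loss))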
queries = []
if args.query:
    queries.append(args.query)
if args.file:
    with open(args.file, 'r') as fin:
        queries.extend([line.strip().split('\t')[1] for line in fin])

result = {}
queries = {
    query: collections.OrderedDict(
        itertools.islice(ranker.rank(query).items(), 10))
    for query in queries
}

for filename in filenames:
    for pmid, document in CorpusReader(filename).items():
        toremove = list()
        for query, scores in queries.items():
            score = scores.pop(pmid, None)
            if score is not None:
                if len(scores) == 0:
                    toremove.append(query)
                result_scores = result.setdefault(query, [])
                result_scores.append((document, score))
        for query in toremove:
            queries.pop(query)
        if len(queries) == 0:
            break
    else:
        # Continues if the inner loop DIDN'T break!
        continue
    break
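# --- Minimal sketch (not from the original source) of the for/else pattern
# used above: the else clause runs only when the inner loop finishes without
# hitting break, so "continue" there keeps the outer loop going, while the
# trailing break fires only when the inner loop did break (all queries done).
for outer in range(3):
    for inner in range(3):
        if outer == 1 and inner == 1:
            break
    else:
        continue  # inner loop exhausted: move on to the next outer item
    break         # inner loop broke: stop the outer loop too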
def index(self, corpus_reader: CorpusReader):
    for pmid, document in corpus_reader.items():
        self.update(pmid, document)
        self.documents.add(pmid)
        if self.process.memory_percent() >= self.max_memory_usage:
            self.dispatch()
def Main():
    while True:
        # Display menu options
        DisplayMenu()
        op = raw_input("\nOption > ")
        if not op.isdigit() or int(op) not in [0, 1, 2, 3, 4, 5, 6]:
            print "Invalid option"
            continue
        op = int(op)
        if op == 0:
            # Exit
            break
        else:
            # Read the parameters
            parameter = parameters[op - 1]
            name = parameters[op - 1][0].split("/")[-1]
            corpus = CorpusReader(
                parameter[0],
                parameter[1],
                parameter[2],
                parameter[3],
                parameter[4],
                category_position=parameter[5],
                category_level=parameter[6],
                start=parameter[7],
                decoding=parameter[8],
            )
            try:
                # Get reviews and shuffle them
                reviews = list(enumerate(corpus.get_opinions()))  # TODO: replace with a database read
                op = raw_input("\nInsert IDs separated by ',' or <intro> for pick up randomly > ")
                if op:
                    # From indexes
                    indexes = [int(i) for i in op.split(',')]
                    indexes = set(indexes)   # Ensure no duplicates
                    indexes = list(indexes)  # Transform
                    left = len(indexes)
                else:
                    # Randomly
                    while not op.isdigit():
                        op = raw_input("How many? > ")
                    left = int(op)
                    indexes = range(len(reviews))
                    random.shuffle(indexes)
                    indexes = indexes[:left]
                reviews = [(i, review) for (i, review) in reviews if i in indexes]
                result = []

                # Tag every review
                while left != 0:
                    # Start
                    id, review = reviews[left - 1]
                    words = review.split(' ')
                    total = len(words)
                    cats = [' ' for _ in range(total)]

                    # For each word, annotate with (N) or (I) and give the
                    # possibility of going back by pressing (B)
                    cat = ""
                    idx = 0
                    while True:
                        # Display review
                        DisplayReview(id, idx, total, words, cats)

                        # Check end condition
                        if idx == total:
                            op = raw_input("\nDone. Proceed with the next review (left %i)? [y/n] > " % (left - 1))
                            if op == 'y':
                                break
                            idx = idx - 1 if idx != 0 else 0
                            cats[idx] = ' '
                            continue

                        # Ask for input
                        tooltip = "\nTag with N(ormal) or I(nverted). "
                        tooltip += "Enter A(bort), B(ack) or <intro> for "
                        tooltip += "repeating last action (%s) > " % (cat.upper() if cat else "None")
                        tag = raw_input(tooltip)
                        if not tag and not cat:
                            # Prevents parsing an empty cat
                            print "Input a category first"; raw_input()
                            continue
                        elif tag:
                            cat = tag

                        # Action from decision
                        cat = cat.lower()
                        if not cat or cat not in 'niba':
                            print "Option", cat, "is not correct."; raw_input()
                            continue
                        if cat == 'b':
                            # Back
                            idx = idx - 1 if idx != 0 else 0
                            cats[idx] = ' '
                        elif cat == 'a':
                            op = raw_input("Are you sure you want to abort (left %i)? [y/n] > " % left)
                            if op.lower() == 'y':
                                raise Exception("Abort")
                        else:
                            # Associate the category
                            cats[idx] = cat
                            idx = idx + 1

                    # Save the result as two lists: words and their respective categories
                    result.append({
                        "id": id + 1,
                        "from": name,
                        "annotation": ' '.join(word.lower() + "/" + cat for word, cat in zip(words, cats))
                    })

                    # Update
                    left -= 1

                # View and save results
                if op == 0:
                    continue
                ViewSave(result, name)
            except Exception as e:
                content = json.dumps(result, indent=4, ensure_ascii=False)
                error = "Corpus:%s, Review:%i, Description:%s Partial:%s" % (name, id, str(e), content)
                log(error)
                raw_input("Reason: %s\nEnter to continue..." % str(e))
def train_model4(classifier):
    path1 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRISK2020_T1_training_data',
                         'eRISK2020_T1_training_data',
                         'eRISK2020_training_data')
    path2 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRisk2020_T1_test_data', 'eRisk2020_T1_test_data',
                         'T1')

    print("Creating Corpus Reader for training")
    corpus_reader_train = CorpusReader(path1)
    corpus_reader_train.load()
    print("Corpus Reader for training created")

    corpus_reader_test = CorpusReader(path2)
    corpus_reader_test.load()
    print("Corpus Reader for testing created")

    emo = Emojis()
    token = Token("yake")

    """ set the tokenizer and model parameters """
    bert_model = SentenceTransformer('paraphrase-mpnet-base-v2')

    # create the bert
    bert_transformer = BigBird(bert_model)
    sentiment = Sentiment()

    """ training the model """
    print("Initializing Training")

    model = Pipeline([
        ('emojis', emo),
        ('tokenizer', token),
        ('union', FeatureUnion(transformer_list=[
            ("vectorizer", bert_transformer),
            ("sentiment", sentiment),
        ])),
        ("classifier", classifier),
    ])

    batch_size = 40
    num_users = len(corpus_reader_train.subjects)
    count = 0
    all_texts = list()
    all_gt = list()
    for i in range(0, num_users, batch_size):
        all_texts.append([
            subject.posts
            for subject in corpus_reader_train.subjects[(batch_size * count):(batch_size * (count + 1))]
        ])
        all_gt.append([
            subject.gt
            for subject in corpus_reader_train.subjects[(batch_size * count):(batch_size * (count + 1))]
        ])
        count += 1

    for i in range(len(all_texts)):
        model.fit(all_texts[i], all_gt[i])

    '''
    num_users = len(corpus_reader_test.subjects)
    all_texts = list()
    all_gt = list()
    count = 0
    for i in range(0, num_users, batch_size):
        all_texts.append([
            subject.posts
            for subject in corpus_reader_test.subjects[(batch_size * count):(batch_size * (count + 1))]
        ])
        all_gt.append([
            subject.gt
            for subject in corpus_reader_test.subjects[(batch_size * count):(batch_size * (count + 1))]
        ])
        count += 1

    for i in range(len(all_texts)):
        model.fit(all_texts[i], all_gt[i])
    '''

    print("End of training")

    # It's important to use binary mode
    dbfile = open(f'model4_{classifier.__class__.__name__}.sav', 'wb')
    pickle.dump(model, dbfile)

    return model
N_COMPONENTS = parameters['N_COMPONENTS']
MODEL_PATH = parameters['MODEL_PATH']
NUM_OF_SAMPLES = parameters['NUM_OF_SAMPLES']
WINDOW_SIZE = parameters['WINDOW_SIZE']
TEST_FOLDER = parameters['TEST_FOLDER']
TEST_FILE = parameters['TEST_FILE']
MODE = parameters['MODE']

if __name__ == '__main__':
    reader = CorpusReader(DATA_PATH, FOLDER_NAME, NUM_OF_SAMPLES=NUM_OF_SAMPLES)

    todo_path = os.path.join('bin', FOLDER_NAME + '_todo.json')
    done_path = os.path.join('bin', FOLDER_NAME + '_done.json')
    if os.path.exists(todo_path) and os.path.exists(done_path):
        with open(todo_path, 'r') as todo_f:
            todo = json.load(todo_f)
            todo_list = todo['document_ids']
        with open(done_path, 'r') as done_f:
            done = json.load(done_f)
            done_list = done['document_ids']
        assert reader.documents_amount == len(todo_list), \
            "Something wrong within the corpus, please delete 'bin' folder and re-run it."
        if len(todo_list) != len(done_list):
            build_elasticsearch(data_path=DATA_PATH, zipfile_name=FOLDER_NAME)