Example #1
def indexit(tokenizer, filenames):
    indexer = Indexer(tokenizer)
    for filename in filenames:
        corpus_reader = CorpusReader(filename)
        indexer.index(corpus_reader)
    indexer.sort()
    return indexer
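A minimal usage sketch for the helper above, assuming the same Indexer and CorpusReader classes as the snippet; the whitespace tokenizer and file names below are placeholders, not part of the original project.

# Hedged usage sketch: build an index over two corpus files with a
# placeholder whitespace tokenizer.
def whitespace_tokenizer(text):
    return text.lower().split()

indexer = indexit(whitespace_tokenizer, ["corpus_part1.txt", "corpus_part2.txt"])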
Example #2
def main():
    args = parse_args()
    r = CorpusReader(accent_map=args.accents, filter_punct=args.filter_punct, lower=args.lower)
    featdict, labels = r.get_featdict_from_lines(stdin, window=args.window)
    vec = DictVectorizer()
    X = vec.fit_transform(featdict).toarray()
    y, label_d = convert_labels(labels)
    cnt = defaultdict(int)
#    for l in y:
#        cnt[label_d[l]] += 1
#    for k, v in cnt.iteritems():
#        print('{0} {1}'.format(k.encode('utf8'), v))
    #print label_d
    #print(vec.fit_transform(featdict).toarray())
    #print vec.get_feature_names()
    run_pipeline(X, y)
Example #3
 def index(self, corpus_reader: CorpusReader):
     update = self.__index.update
     for pmid, document in corpus_reader.items():
         update(pmid, document)
         self.__documents.setdefault(pmid, 0)
         if self.__process.memory_percent() >= self.__max_memory_usage:
             self.__dispatch()
Example #4
	def __init__(
		self,
		files=[],
		directories=[],
		skip=[],
		unigram_dictionary=None,
		noise_ratio=15,
		kernel=[1,2,3,4,5,5,4,3,2,1],
		t = 1.0e-5,
		batch_size = 1000,
		parse=default_parse,
		verbose=True
	):

		# Get a corpus reader
		self.corpus_reader = CorpusReader(
			files=files, directories=directories, skip=skip, parse=parse,
			verbose=verbose
		)

		# Load the unigram_dictionary
		if unigram_dictionary is not None:
			self.unigram_dictionary = unigram_dictionary
		else:
			self.unigram_dictionary = UnigramDictionary()

		self.noise_ratio = noise_ratio
		self.kernel = kernel
		self.t = t
		self.batch_size = batch_size

		# Validate the kernel.  It should reflect the relative 
		# frequencies of choosing tokens from a window of +/- K tokens
		# relative to a query token.  So it must have an even number of
		# entries
		if len(self.kernel) % 2 != 0:
			raise ValueError(
				'kernel should reflect the relative frequencies of '
				'selecting a context token within +/- K of the query '
				'token, and so should have an equal number of entries '
				'defining frequencies to the left and right of the query '
				'token, and so should have an even number of entries.'
			)
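The kernel comment above is easier to see with a concrete case. The sketch below spells out the convention it describes, under the assumption (suggested by the comment, not confirmed elsewhere in the snippet) that the first half of the kernel weights offsets -K..-1 and the second half weights offsets +1..+K relative to the query token.

# Sketch of the kernel convention described in the comment above (an
# assumption based on that comment): a 10-entry kernel implies K = 5, and
# each weight is the relative frequency of sampling a context token at a
# given offset from the query token.
kernel = [1, 2, 3, 4, 5, 5, 4, 3, 2, 1]
K = len(kernel) // 2                                  # K = 5
offsets = list(range(-K, 0)) + list(range(1, K + 1))  # -5..-1, 1..5
relative_freq = dict(zip(offsets, kernel))
# e.g. relative_freq[-1] == 5 and relative_freq[-5] == 1: the token directly
# left of the query is sampled five times as often as a token 5 positions away.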
Example #5
 def __init__(self, path, corpus_path=''):
     self.base_path = os.path.expanduser(path)
     self.models_path = os.path.join(self.base_path, "models")
     if corpus_path == '':
         corpus_path = os.path.join(self.base_path, "corpus")
     self.corpus_path = corpus_path
     self.filters = {'language' : 'english'}
     os.makedirs(self.base_path, exist_ok=True)
     os.makedirs(self.models_path, exist_ok=True)
     os.makedirs(self.corpus_path, exist_ok=True)
     self.cr = CorpusReader(corpus_path)
Example #6
def indexit(tokenizer,
            filenames,
            store_positions=False,
            calculate_tfidf=False,
            memory_usage=20):
    index = Index(tokenizer, store_positions)
    indexer = Indexer(index, 'index', max_memory_usage=memory_usage)
    for filename in filenames:
        indexer.index(CorpusReader(filename))
    indexer.merge(calculate_tfidf)
    return index
Example #7
    def run(self):
        corpus_reader = CorpusReader(self.path)
        corpus_reader.load()
        analyser = SentimentIntensityAnalyzer()

        num_subs = len(corpus_reader.subjects)
        for i, sub in enumerate(corpus_reader.subjects):
            print(f"Number os subjects left : {num_subs - i}")
            for post in sub.posts:
                score = analyser.polarity_scores(str(post))
                s = score['compound']
                if abs(s) > self.threshold:
                    string = spplit(str(post))
                    for j in range(3):
                        for i in range(len(string) - j):
                            score_word = analyser.polarity_scores(' '.join(
                                string[i:(i + j)]))
                            word_compound = score_word['compound']
                            if abs(word_compound) > self.threshold:
                                if string[i] not in self.imp_words:
                                    self.imp_words.append(' '.join(
                                        string[i:(i + j)]))
Example #8
def indexit(tokenizer,
            filenames,
            store_positions=False,
            calculate_tfidf=False,
            memory_usage=20):
    indexer = Indexer(tokenizer,
                      'indexer',
                      store_positions=store_positions,
                      max_memory_usage=memory_usage)
    for filename in filenames:
        corpus_reader = CorpusReader(filename)
        indexer.index(corpus_reader)
    indexer.merge(calculate_tfidf)
    return indexer
Example #9
def main():
    path = os.path.join('..', '..', 'dataset', 'eRISK2020_T1_training_data',
                        'td')

    print("Creating Corpus Reader for training")
    corpus_reader_train = CorpusReader(path)
    corpus_reader_train.load()
    print("Corpus Reader for training created")

    path = os.path.join('..', '..', 'dataset', 'T1_test_data', 'td')
    gt_name = 'T1_erisk_golden_truth.txt'
    corpus_reader_test = CorpusReader(path, gt_name)
    corpus_reader_test.load()

    all_texts = [
        ''.join(map(str, subject.posts))
        for subject in corpus_reader_train.subjects
    ]
    all_gt = [subject.gt for subject in corpus_reader_train.subjects]

    count_vectorizer = CountVectorizer(analyzer='word',
                                       token_pattern=r'\w+',
                                       ngram_range=(1, 2))
    bow = dict()
    bow["train"] = (count_vectorizer.fit_transform(all_texts), all_gt)

    lr_classifier = LogisticRegression(solver='liblinear')
    lr_classifier.fit(*bow["train"])

    matrix = Matrix(len(corpus_reader_test.subjects),
                    corpus_reader_test.subjects)
    args = {'matrix': matrix, 'vec': count_vectorizer, 'class': lr_classifier}

    matrix = run_simulation(args)

    print(matrix)

    # analyze results
    precision = measures.calc_precision(corpus_reader_test.subjects, matrix)
    recall = measures.calc_recall(corpus_reader_test.subjects, matrix)
    f1 = measures.calc_f1(precision, recall)
    ERDE = measures.calc_ERDE(corpus_reader_test.subjects, matrix)
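For reference, the F1 combination above is presumably the usual harmonic mean; a one-line sketch of that assumption follows (measures.calc_f1 itself is not shown in the snippet).

# Assumed behaviour of measures.calc_f1 (not shown here): the harmonic mean
# of precision and recall, guarded against division by zero.
def calc_f1(precision, recall):
    return 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0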
Example #10
from corpus_reader import CorpusReader
from preprocess import PreProcess
from tf_idf import TfIdf
from knn import KNN
from metrics import MetricsGenerator
from pprint import pprint as pp

if __name__ == '__main__':
  print('reading...')
  reader = CorpusReader()
  reader.run()
  
  parser = PreProcess()
  parsed_trainning_documents = {}
  print('processing...')
  for k, v in reader.train.items():
    parsed_trainning_documents[k] = parser.process(v)
  
  # Input for tf-idf: we must annotate the documents with their classes.
  # It receives as input an array of tuples: ([tokens], class)
  parsed_trainning_documents_with_classes = []
  for k in parsed_trainning_documents.keys():
    parsed_trainning_documents_with_classes += [(v, k) for v in parsed_trainning_documents[k]]
  
  # Run tf-idf
  print('generating tf.idf...')
  tf_idf_calculator = TfIdf(parsed_trainning_documents_with_classes)
  tf_idf_calculator.run()
  
  # test the KNN parameters: distance metric and value of K
  for metric in ['cosine', 'euclid']:
Example #11
def train6():

    with open("log.txt", 'w') as f:
        pass
    #path1 = os.path.join( '..', '..',  'dataset', 'eRISK2020_T1_training_data', 'train') 
    #path1 = os.path.join( '..', 'data', 'erisk-2021-t2', 'td') 
    path1 = os.path.join('..', 'data', 'erisk-2021-t2', 'training', 'eRISK2020_T1_training_data', 'eRISK2020_T1_training_data', 'eRISK2020_training_data')
    path2 = os.path.join('..', 'data', 'erisk-2021-t2', 'training', 'eRisk2020_T1_test_data', 'eRisk2020_T1_test_data', 'T1')
    print("Creating Corpus Reader for training")
    corpus_reader_train = CorpusReader(path1)
    corpus_reader_train.load()
    print("Corpus Reader for training created")
    corpus_reader_test = CorpusReader(path2)
    corpus_reader_test.load()
    print("Corpus Reader for testing created")

    emo = Emojis()
    token = Token()

    """ set the tokenizer and model parameters """
    #tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    #bert_model = BertModel.from_pretrained("bert-base-uncased")
    bert_model = SentenceTransformer('paraphrase-mpnet-base-v2')
    #device = torch.device("cuda")


    #bert_model.to(device)
    
    # create the bert
    bert_transformer = BigBird(bert_model)


    sentiment = Sentiment()

    
    """ training the model """
    print("Initializing Training")
    #n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0
    parameters = {
        'classifier__n_estimators': [50, 100, 500, 1000],
        'classifier__learning_rate': [0.001, 0.01, 0.1, 1.0],
        'classifier__max_depth': [1, 3, 5, 10],
    }
    classifier = GradientBoostingClassifier()

    model = Pipeline([
        ('emojis', emo),
        #('tokenizer', token),
        ('union', FeatureUnion(transformer_list=[
            ("vectorizer", bert_transformer),
            ("sentiment", sentiment),
        ])),
        ("classifier", classifier),
    ])

    clf = GridSearchCV(model, parameters)

    batch_size = 40

    num_users = len(corpus_reader_train.subjects)
    #print(num_users)
    for j in range(50, 2000, 50):
        count = 0
        all_texts = list()
        all_gt = list()
        for i in range(0, num_users, batch_size):
            #print(i)
            
            all_texts.append([
                subject.posts[0:j]
                for subject in corpus_reader_train.subjects[(
                    batch_size * count):(batch_size * (count + 1))]
            ])
            all_gt.append([
                subject.gt for subject in corpus_reader_train.subjects[(
                    batch_size * count):(batch_size * (count + 1))]
            ])
            count += 1

        print(all_gt[0])
        for i in range(len(all_texts)):
            clf.fit(all_texts[i], all_gt[i])

    
    num_users = len(corpus_reader_test.subjects)
    
    #print(num_users)
    for j in range(50, 2000, 50):
        all_texts = list()
        all_gt = list()
        count = 0
        for i in range(0, num_users, batch_size):
            print(i)
            all_texts.append([
                subject.posts[0:j] for subject in corpus_reader_test.subjects[(
                    batch_size * count):(batch_size * (count + 1))]
            ])
            all_gt.append([
                subject.gt for subject in corpus_reader_test.subjects[(
                    batch_size * count):(batch_size * (count + 1))]
            ])
            count += 1

        
        for i in range(len(all_texts)):
            clf.fit(all_texts[i], all_gt[i])
    

    
    print("End of training")
    return clf
Example #12
__author__ = 'rwechsler'

import gensim
import sys
import glob
from corpus_reader import CorpusReader

files = glob.glob(sys.argv[1])
outfile_name = sys.argv[2]

dataset = CorpusReader(files)

model = gensim.models.Word2Vec(dataset,
                               size=500,
                               window=5,
                               min_count=3,
                               negative=5,
                               workers=2)

model.save(outfile_name)
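A version note on the call above: the size/min_count keyword style matches the pre-4.0 gensim API; under gensim >= 4.0 the same model would be built with vector_size instead of size (this equivalence is an addition, not part of the original snippet).

# gensim >= 4.0 equivalent of the call above (size was renamed to vector_size):
# model = gensim.models.Word2Vec(dataset, vector_size=500, window=5,
#                                min_count=3, negative=5, workers=2)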
    """
    data_path_train = os.path.join(args.data_dir, "train.csv.pkl")
    data_path_test = os.path.join(args.data_dir, "test.csv.pkl")

    model_name = args.model_name
    emb_path = args.emb_path
    save_dir = args.save_dir

    logging.info("loading word emb")
    word2idx, embedding_matrix = GloveEmbeddings.get_embeddings_with_custom_tokens(
        path=emb_path, embedding_dim=dim)
    vocab_size = len(word2idx)
    logging.info("word emb loaded: {}".format(vocab_size))

    logging.info("loading dataset")
    X_train, Y_train, X_dev, Y_dev, X_test, _ = CorpusReader.get_question_pair_data(
        data_path_train, data_path_test)
    """
    trim the test set if desired
    """
    if args.size_test_set:
        X_test = X_test[:args.size_test_set]

    assert (len(X_train) == len(Y_train)), "Train data and label size mismatch"
    logging.info("train size: {}, test size: {}, dev size: {}".format(
        len(X_train), len(X_test), len(X_dev)))
    logging.info("loaded dataset")

    list_classes = ["0", "1"]
    model = QuestionPairDecomposableAttModelTF(
        v=vocab_size,
        d=dim,
Example #14
    ("../../corpus/corpus_cine","*.xml","<body>(.*?)</body>","rank=\"(.*?)\"","FILE",
        "BEFORE",None,0,'utf8',{u'1': 0, u'2': 25, u'3': 50, u'4': 75, u'5': 100}),
    ("../../corpus/corpus_hoteles","*.xml","<coah:review>(.*?)</coah:review>","<coah:rank>(.*?)</coah:rank>","FILE",
        "BEFORE",None,0,'utf8',{u'1': 0, u'2': 25, u'3': 50, u'4': 75, u'5': 100}),
    ("../../corpus/corpus_prensa_uy","*.csv","\"(.*?)\",(?:TRUE|FALSE)",",(.*?)\\n","FILE",
        "AFTER",None,0,'utf8',{u'Neg': 0, u'Neu': 50, u'Pos': 100}),
    ("../../corpus/corpus_tweets","*.tsv","(.*?)\\t.*?\\n","(.*?\\t.*?)\\t","FILE",
        "BEFORE",None,1,'utf8',{u'3\t1': 10, u'3\t2': 20, u'2\t4': 90, u'2\t2': 70, u'2\t3': 60, u'4\t2': 30, u'2\t1': 80, 
                                u'5\t1': 40, u'1\t5': 50, u'1\t4': 30, u'4\t1': 50, u'1\t1': 40, u'1\t3': 60, u'1\t2': 70}),
    ("../../corpus/corpus_variado_sfu","*/*.txt","(.*)\s","(.*?)_","PATH",
        None,1,0,'utf8',{'no': 0, 'yes': 100})
]

# Read each corpus
from corpus_reader import CorpusReader
for parameter in parameters:
    reader = CorpusReader(
                    parameter[0],
                    parameter[1],
                    parameter[2],
                    parameter[3],
                    parameter[4],
                    category_position=parameter[5],
                    category_level=parameter[6],
                    start=parameter[7],
                    decoding=parameter[8],
                )
    fun = parameter[9]
    data = reader.get_data(lambda x:fun[x])
    
Example #15
def get_input_option(prompt, options):
    res = input(prompt + " (" + "/".join(options) + ") ")
    while res not in options:
        res = input("pardon? (" + "/".join(options) + ") ")
    return res


if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(description='Corpus Filter')
    arg_parser.add_argument('corpus_file', help='path to the corpus file')
    arg_parser.add_argument('output_prefix', help='path to the output files')
    args = arg_parser.parse_args()

    print('\n - Filtering Corpus -\n')

    corpus = CorpusReader(args.corpus_file)
    file_output_pos = open(args.output_prefix + '.pos', 'w', encoding='utf8')
    file_output_neg = open(args.output_prefix + '.neg', 'w', encoding='utf8')
    file_output_fav = open(args.output_prefix + '.fav', 'w', encoding='utf8')

    for tweet in corpus.text_json():
        tweet = tweet.replace('\n', ' ')
        tweet = tweet.strip()
        print('"' + tweet + '"')
        action = get_input_option('sarcasm detected?', ['y', 'n', 'f', 'q'])
        if action == 'f':
            file_output_fav.write(tweet + '\n')
            action = get_input_option('faved, but is there sarcasm?',
                                      ['y', 'n', 'q'])
        if action == 'y':
            file_output_pos.write(tweet + '\n')
Example #16
    #mod3 = pickle.load(open(MODEL3_NAME, 'rb'))
    #mod4 = pickle.load(open(MODEL4_NAME, 'rb'))
    #mod5 = pickle.load(open(MODEL5_NAME, 'rb'))
    #mod6 = pickle.load(open(MODEL6_NAME, 'rb'))
    #mod7 = pickle.load(open(MODEL7_NAME, 'rb'))
    #device = torch.device("cuda")
    #no_vader.to(device)
    
    

    path = os.path.join( '..', 'data', 'erisk-2021-t2') 
    #path = os.path.join( '..', '..',  'dataset', 'T1_test_data', 'test')

    gt_name = 'golden_truth.txt'

    corpus_reader_test = CorpusReader(path)
    corpus_reader_test.load()

    with open("file.txt", 'w') as f:
        for sub in corpus_reader_test.subjects:
            f.write("{} - {}\n".format(sub.id, sub.gt))

    filename = "RESULTS_TEST_more_model3_no_token_param.txt"

    #clean file
    with open(filename, 'w') as file:
        pass

    # find the greatest number of posts
    posts_max = max([ len(s.posts) for s in corpus_reader_test.subjects ])
    print(posts_max)
Example #17
    def __init__(
            self,
            max_len=50,  # Maximum sentence length, same for questions, answers and reviews
            num_reviews=20,  # Number of review candidates for each QA pair
            selftest=False,
            if_only_top_ans=True,
            top_score_recorder=None,
            load_meta=True,
            load_vocab=True,
            load_qa=True,
            load_review=True,
            load_word_embedding=True):
        try:
            # if not selftest:
            #   filename = os.path.join(DATA_PATH, 'datautil.pickle')
            # else:
            #   filename = os.path.join(DATA_PATH, 'datautil-selftest.pickle')
            # logger.info('Loading stored data from {} ...'.format(filename))
            # with open(filename, 'rb') as f:
            #   tmp_dict = pickle.load(f)
            # self.__dict__.clear()
            # self.__dict__.update(tmp_dict)
            self.selftest = selftest
            if load_meta:
                self._load_meta()
            if load_vocab:
                self._load_vocab()
            if load_qa:
                self._load_qa()
            if load_review:
                self._load_review()
            if load_word_embedding:
                self._load_word_embedding()
        except IOError:
            logger.info('Stored data not found, preprocessing ...')
            self.selftest = selftest
            self.max_len = max_len
            self.num_reviews = num_reviews

            logger.info('Initializing CorpusReader ...')
            corpusreader = CorpusReader(
                maxline=SELF_TEST_MAX_LINE if selftest else -1,
                num_reviews=(5 * self.num_reviews),
                if_only_top_ans=if_only_top_ans,
                load_glove=False if selftest else True)
            self.vocab_size = corpusreader.vocab_size
            self.num_pos_tags = corpusreader.num_pos_tags
            self.embed_matrix = corpusreader.embed_matrix
            self.w_embed_size = corpusreader.w_embed_size
            self.word2id = corpusreader.word2id
            self.id2word = corpusreader.id2word
            self.id2freq = corpusreader.id2freq
            self.pos2id = corpusreader.pos2id
            self.id2pos = corpusreader.id2pos

            logger.info('Read corpus data and convert to arrays ...')
            data, review_data, asin2id = self._read_into_arrays(
                corpusreader=corpusreader, if_only_top_ans=if_only_top_ans)
            self.review_data = review_data
            del corpusreader
            del review_data
            gc.collect()

            logger.info('Calculate review IDF ...')
            self.review_idf = self._get_review_idf()

            logger.info('Splitting data into train, dev, test sets ...')
            self._train_idx, self._dev_idx, self._test_idx = [], [], []
            self._train_size, self._dev_size, self._test_size = 0, 0, 0
            self._data_split(data)
            del data
            gc.collect()

            # logger.info('Storing into {}...'.format(filename))
            # with open(filename, 'wb') as f:
            #   pickle.dump(self.__dict__, f)
            self._save_meta()
            self._save_vocab()
            self._save_qa()
            self._save_review()
            self._save_word_embedding()

        self._block_to_dense()
        self.top_score_recorder = top_score_recorder
        if self.top_score_recorder is not None:
            logger.info("Train with Pseudo Relevance Feedbacks")
        self._print_info()
Example #18
def train_model1(classifier):

    path1 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRISK2020_T1_training_data',
                         'eRISK2020_T1_training_data',
                         'eRISK2020_training_data')
    path2 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRisk2020_T1_test_data', 'eRisk2020_T1_test_data',
                         'T1')
    print("Creating Corpus Reader for training")
    corpus_reader_train = CorpusReader(path1)
    corpus_reader_train.load()
    print("Corpus Reader for training created")
    corpus_reader_test = CorpusReader(path2)
    corpus_reader_test.load()
    print("Corpus Reader for testing created")

    emo = Emojis()
    token = Token("normal")
    """ set the tokenizer and model parameters """
    #tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    #bert_model = BertModel.from_pretrained("bert-base-uncased")
    bert_model = SentenceTransformer('paraphrase-mpnet-base-v2')
    #device = torch.device("cuda")

    #bert_model.to(device)

    # create the bert
    bert_transformer = BigBird(bert_model)

    sentiment = Sentiment()
    """ training the model """
    print("Initializing Training")
    #classifier = svm.SVC(C = 1, gamma = 'scale', kernel = 'linear', probability = True)
    #clf = CalibratedClassifierCV(classifier)
    #classifier = svm.SVC(C = 1, gamma = 'scale', kernel = 'linear', probability = True)
    #classifier = AdaBoostClassifier(learning_rate = 0.01, n_estimators = 100)

    #clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)

    model = Pipeline([
        ('emojis', emo),
        ('tokenizer', token),
        (
            'union',
            FeatureUnion(transformer_list=[
                ("vectorizer", bert_transformer),
                #("sentiment", sentiment),
            ])),
        ("classifier", classifier),
    ])

    batch_size = 40

    num_users = len(corpus_reader_train.subjects)
    #print(num_users)
    for j in range(50, 2000, 50):
        count = 0
        all_texts = list()
        all_gt = list()
        for i in range(0, num_users, batch_size):
            #print(i)

            all_texts.append([
                subject.posts[0:j]
                for subject in corpus_reader_train.subjects[(
                    batch_size * count):(batch_size * (count + 1))]
            ])
            all_gt.append([
                subject.gt for subject in corpus_reader_train.subjects[(
                    batch_size * count):(batch_size * (count + 1))]
            ])
            count += 1

        print(all_gt[0])
        for i in range(len(all_texts)):
            model.fit(all_texts[i], all_gt[i])

    num_users = len(corpus_reader_test.subjects)

    #print(num_users)
    for j in range(50, 2000, 50):
        all_texts = list()
        all_gt = list()
        count = 0
        for i in range(0, num_users, batch_size):
            print(i)
            all_texts.append([
                subject.posts[0:j] for subject in corpus_reader_test.subjects[(
                    batch_size * count):(batch_size * (count + 1))]
            ])
            all_gt.append([
                subject.gt for subject in corpus_reader_test.subjects[(
                    batch_size * count):(batch_size * (count + 1))]
            ])
            count += 1

        for i in range(len(all_texts)):
            model.fit(all_texts[i], all_gt[i])

    print("End of training")

    # It's important to use binary mode
    dbfile = open(f'model1_{classifier.__class__.__name__}.sav', 'wb')
    pickle.dump(model, dbfile)
    return model
Example #19
 def add_langauge(self, pattern, response_pattern, language=ENGLISH):
     self._response = response_pattern
     CorpusReader.add_langauge(self, pattern, language)
Example #20
class MinibatchGenerator(object):

	NOT_DONE = 0
	DONE = 1

	def __init__(
		self,
		files=[],
		directories=[],
		skip=[],
		unigram_dictionary=None,
		noise_ratio=15,
		kernel=[1,2,3,4,5,5,4,3,2,1],
		t = 1.0e-5,
		batch_size = 1000,
		parse=default_parse,
		verbose=True
	):

		# Get a corpus reader
		self.corpus_reader = CorpusReader(
			files=files, directories=directories, skip=skip, parse=parse,
			verbose=verbose
		)

		# Load the unigram_dictionary
		if unigram_dictionary is not None:
			self.unigram_dictionary = unigram_dictionary
		else:
			self.unigram_dictionary = UnigramDictionary()

		self.noise_ratio = noise_ratio
		self.kernel = kernel
		self.t = t
		self.batch_size = batch_size

		# Validate the kernel.  It should reflect the relative 
		# frequencies of choosing tokens from a window of +/- K tokens
		# relative to a query token.  So it must have an even number of
		# entries
		if len(self.kernel) % 2 != 0:
			raise ValueError(
				'kernel should reflect the relative frequencies of '
				'selecting a context token within +/- K of the query '
				'token, and so should have an equal number of entries '
				'defining frequencies to the left and right of the query '
				'token, and so should have an even number of entries.'
			)


	def get_vocab_size(self):
		'''
		Get the size of the vocabulary.  Only makes sense to call this
		after MinibatchGenerator.prepare() has been called, or if an
		existing (pre-filled) UnigramDictionary was loaded, since otherwise 
		it would just return 0.
		'''
		# Delegate to the underlying UnigramDictionary
		return len(self.unigram_dictionary)


	def load(self, directory):
		'''
		Load the unigram_dictionary whose files are stored in <directory>.
		'''
		# Delegate to the underlying UnigramDictionary
		self.unigram_dictionary.load(directory)

	
	def save(self, directory):
		'''
		Save the unigram_dictionary to <directory>.
		'''
		# Delegate to the underlying UnigramDictionary
		self.unigram_dictionary.save(directory)


	def check_access(self, savedir):

		savedir = os.path.abspath(savedir)
		path, dirname = os.path.split(savedir)

		# Make sure that the directory we want exists (make it if not)
		if not os.path.isdir(path):
			raise IOError('%s is not a directory or does not exist' % path)
		if not os.path.exists(savedir):
			os.mkdir(savedir)
		elif os.path.isfile(savedir):
			raise IOError('%s is a file.' % savedir)

		# Make sure we can write to the directory
		f = open(os.path.join(savedir, '.__test-w2v-access'), 'w')
		f.write('test')
		f.close()
		os.remove(os.path.join(savedir, '.__test-w2v-access'))


	def prepare(self, savedir=None, min_frequency=None):
		'''
		Iterate over the entire corpus in order to build a 
		UnigramDictionary.  We need this because we need to sample
		from the unigram distribution in producing minibatches.
		Optionally prune all tokens that occur fewer than min_frequency
		times from dictionary.  Use min_frequency=None (the default) to
		specify no pruning.  Optionally save the dictionary to savedir 
		(this is done after pruning if pruning is requested).
		'''

		# Before doing anything, if we were requested to save the 
		# dictionary, make sure we'll be able to do that (fail fast)
		if savedir is not None:
			self.check_access(savedir)

		# Read through the corpus, building the UnigramDictionary
		for line in self.corpus_reader.read_no_q():
			self.unigram_dictionary.update(line)

		# Prune the dictionary, if requested to do so.
		if min_frequency is not None:
			self.unigram_dictionary.prune(min_frequency)

		# Save the dictionary, if requested to do so.
		if savedir is not None:
			self.save(savedir)


	def prune(self, min_frequency=5):
		'''
		Exposes the prune function for the underlying UnigramDictionary
		'''
		self.unigram_dictionary.prune(min_frequency)
  

	def __iter__(self):

		# Once iter is called, a subprocess will be started which
		# begins generating minibatches.  These accumulate in a queue
		# and iteration pulls from that queue.  That way, iteration
		# can begin as soon as the first minibatch is prepared, and 
		# later minibatches are prepared in the background while earlier
		# minibatches are used.  The idea is that this will keep the 
		# CPU(s) busy while training occurs on the GPU

		self.minibatches = Queue()
		self.recv_pipe, send_pipe = Pipe()

		# We'll fork a process to assemble minibatches, and return
		# immediately so that minibatches can be used as they are
		# constructed.
		#
		# Because we assemble the batches within a forked process, its
		# access to randomness doesn't alter the state of the parent's
		# random number generator.  Multiple calls to this function
		# would therefore produce the same set of random samples, which
		# is not desired.  We make a call to the numpy random number
		# generator to advance the parent's random number generator's
		# state to avoid this problem:
		np.random.uniform()

		minibatch_preparation = Process(
			target=self.enqueue_minibatches,
			args=(self.minibatches, send_pipe)
		)
		minibatch_preparation.start()

		return self


	def init_batch(self):
		# Initialize np.array's to store the minibatch data.  We know
		# how big the batch is ahead of time.  Initialize by filling
		# the arrays with UNK tokens.  Doing this means that, at the end
		# of the corpus, when we don't necessarily have a full minibatch,
		# the final minibatch is padded with UNK tokens in order to be
		# of the desired shape.  This has no effect on training, because
		# we don't care about the embedding of the UNK token
		signal_batch = np.full(
			(self.batch_size, 2), UNK, dtype='int32'
		)
		noise_batch = np.full(
			(self.batch_size * self.noise_ratio, 2), UNK, dtype='int32'
		)
		return signal_batch, noise_batch


	def generate(self):

		chooser = TokenChooser(K=len(self.kernel)/2, kernel=self.kernel)
		signal_batch, noise_batch = self.init_batch()

		# i keeps track of position in the signal batch
		i = -1
		for line in self.corpus_reader.read_no_q():

			# Isolated tokens (e.g. one-word sentences) have no context
			# and can't be used for training.
			if len(line) < 2:
				continue

			token_ids = self.unigram_dictionary.get_ids(line)

			# We'll now generate signal examples and noise
			# examples for training
			for query_token_pos, query_token_id in enumerate(token_ids):

				# Possibly discard the token
				if self.do_discard(query_token_id):
					continue

				# Increment position within the batch
				i += 1

				# Sample a token from the context
				context_token_pos = chooser.choose_token(
					query_token_pos, len(token_ids)
				)
				context_token_id = token_ids[context_token_pos]
				signal_batch[i, :] = [query_token_id, context_token_id]

				# Sample tokens from the noise
				noise_context_ids = self.unigram_dictionary.sample(
					(self.noise_ratio,))

				# Figure out the position within the noise batch
				j = i*self.noise_ratio

				# block-assign the noise samples to the noise batch array
				noise_batch[j:j+self.noise_ratio, :] = [
					[query_token_id, noise_context_id]
					for noise_context_id in noise_context_ids
				]

				# Once we've finished assembling a minibatch, enqueue it
				# and start assembling a new minibatch
				if i == self.batch_size - 1:
					yield (signal_batch, noise_batch)
					signal_batch, noise_batch = self.init_batch()
					i = -1

		# Normally we'll have a partially filled minibatch after processing
		# the corpus.  The elements in the batch that weren't overwritten
		# contain UNK tokens, which act as padding.  Enqueue the partial
		# minibatch.
		if i >= 0:
			yield (signal_batch, noise_batch)


	def get_minibatches(self):
		'''
		Reads through the entire corpus, generating all of the minibatches
		up front, storing them in memory as a list.  Returns the list of
		minibatches.
		'''
		minibatches = []
		for signal_batch, noise_batch in self.generate():
			minibatches.append((signal_batch, noise_batch))

		return minibatches


	def enqueue_minibatches(self, minibatch_queue, send_pipe):

		'''
		Reads through the minibatches, placing them on a queue as they
		are ready.  This usually shouldn't be called directly, but 
		is used when the MinibatchGenerator is treated as an iterator, e.g.:

			for signal, noise in my_minibatch_generator:
				do_something_with(signal, noise)

		It causes the minibatches to be prepared in a separate process
		using this function, placing them on a queue, while a generator
		construct pulls them off the queue as the client process requests
		them.  This keeps minibatch preparation running in the background
		while the client process is busy processing previously yielded 
		minibatches.
		'''

		# Continuously iterate through the dataset, enqueuing each
		# minibatch.  The consumer will process minibatches from
		# the queue at its own pace.
		for signal_batch, noise_batch in self.generate():
			minibatch_queue.put((signal_batch, noise_batch))

		# Notify parent process that iteration through the corpus is
		# complete (so it doesn't need to wait for more minibatches)
		send_pipe.send(self.DONE)


	def do_discard(self, token_id):
		'''
		This function helps with downsampling of very common words.
		Returns true when the token should be discarded as a query word
		'''
		probability = self.unigram_dictionary.get_probability(token_id)
		discard_probability = 1 - np.sqrt(self.t/probability)
		do_discard = np.random.uniform() < discard_probability

		#if do_discard:
		#	print 'discarding', self.unigram_dictionary.get_token(token_id)

		return do_discard


	def next(self):
		status = self.NOT_DONE
		while status == self.NOT_DONE:
			try:
				return self.minibatches.get(timeout=0.1)
			except Empty:
				if self.recv_pipe.poll():
					status = self.recv_pipe.recv()

		raise StopIteration
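The do_discard method above applies the familiar word2vec-style subsampling rule. A quick worked case, using the class's default t = 1.0e-5, shows how aggressively frequent tokens are dropped.

# Worked example of the discard rule above, with t = 1.0e-5 as in __init__:
# a token with unigram probability p = 1e-3 gets
#   discard_probability = 1 - sqrt(t / p) = 1 - sqrt(0.01) = 0.9,
# so about 90% of its occurrences are skipped as query tokens, while any
# token with p <= t is never discarded (the expression drops to <= 0).
import numpy as np
t, p = 1.0e-5, 1.0e-3
discard_probability = 1 - np.sqrt(t / p)   # 0.9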
Example #21
class QRNNLM():
    def __init__(self, path, corpus_path=''):
        self.base_path = os.path.expanduser(path)
        self.models_path = os.path.join(self.base_path, "models")
        if corpus_path == '':
            corpus_path = os.path.join(self.base_path, "corpus")
        self.corpus_path = corpus_path
        self.filters = {'language' : 'english'}
        os.makedirs(self.base_path, exist_ok=True)
        os.makedirs(self.models_path, exist_ok=True)
        os.makedirs(self.corpus_path, exist_ok=True)
        self.cr = CorpusReader(corpus_path)


    def encode_docs(self, docs):
        """
        Encodes a list of documents into the necessary format for the RNN
        Returns a tuple of vocabulary and encoded documents
        texts :: list of documents to prepare
        """
        voc = {"<s>":0, "</s>":1} # mapping of words to encoding
        vlist = ["<s>", "</s>"] # vocabulary list
        edocs = [] # list of encoded documents
        for doc in docs:
            edoc = []
            for word in doc:
                if word not in voc:
                    voc[word] = len(vlist)
                    vlist.append(word)
                edoc.append(voc[word])
            if len(edoc) > 0:
                edoc.append(1) # end word
                edocs.append(edoc)
        return (vlist, edocs)
        

    def test_models(self):
        """
        Interactively test the trained models by entering query terms, shows best 5 matches
        """
        terms = input('Comma-separated list of query terms: ')
        termlist = [x.strip() for x in terms.split(',')]

        vmodels = list(self.query(termlist).items()) # find matching models
        vmodels.sort(key=lambda m : m[1], reverse=True) # sort
        if len(vmodels) == 0:
            print('No models found!')
            return

        bmodels = vmodels[:min(5, len(vmodels))] # best five or fewer
        bmodels = [(self.id2name(idx), p) for idx, p in bmodels] # get document file names

        for i, m in enumerate(bmodels): # show list of found models
            print(i + 1, m[0].split('.')[2], '\t', m[1])

        i = input('Press number of choice: ')
        fname = bmodels[int(i)-1][0] # get chosen file name
        path = os.path.join(self.corpus_path, fname) # whole path

        with open(path, 'r') as f:
            print(f.read()) # show file content


    def id2name(self, idx):
        """
        Translates story/model id into file name of fan fiction document
        """
        files = os.listdir(self.corpus_path)
        fname = [f for f in files if f.startswith('ffnet.'+str(idx))][0]
        return fname


    def query(self, terms):
        """
        Query the trained models for terms
        Returns a dictionary mapping model ids to their score for the terms
        terms :: list of query terms
        """
        # find only documents containing all terms using the index
        ids = set(self.index[terms[0]])
        for term in terms[1:]:
            ids = ids.intersection(self.index[term])

        # calculate probabilities for the terms in these models
        model_probs = {}
        for idx in ids:
            vlist, model = self.load(self.models_path, idx)
            dist = model.run([0])
            prob = 0
            for term in terms:
                pos = vlist.index(term) # position of word in output vector
                prob += dist[pos] # use addition for now (else: smoothing and product)
            model_probs[idx] = prob
        return model_probs


    def create_index(self):
        """
        Create index of terms and models they occur in
        """
        index = {}
        modelfiles = os.listdir(self.models_path)
        for name in modelfiles:
            if name != 'index':
                vlist, m = self.load(self.models_path, name)
                for w in vlist:
                    if w not in index:
                        index[w] = [name]
                    else:
                        index[w].append(name)
        self.index = index

    def create_single_models(self, max_count=-1, print_progress=False):
        count = 0
        max_count = max(max_count, -1)
        doc_count = self.cr.count_documents()
        if print_progress:
            print('Number of documents: %s' % str(doc_count))

        for [text,meta] in self.cr.get_corpus_iterator(**self.filters):
            idx = meta['storyid']
            p = self.train_single(5, 10, 1.2, idx, text)
            if print_progress:
                print('Trained and saved model on document no %s/%s' % (str(count+1), str(doc_count)), end='\r')
                utils.print_percent(count/doc_count)
            #print('\nid: %s' % id)
            count += 1
            if count >= max_count:
                break

        '''
        plt.figure(figsize=(20,15))
        legends = []
        for K in range(10,30,5):
            for a in [0.8,1.0,1.2]:
                p = train_singles(5, K, a, ids, texts)
                i, per = np.array(p).T
                plt.plot(i,per)
                legends.append(['K: '+str(K)+', a: '+str(a)])
        plt.legend(legends)
        plt.savefig('plots.svg')
        '''


    def train_single(self, I, K, a, name, text):
        '''
        I: number of epochs
        K: size of hidden layer
        a: learning rate alpha
        name: file/model name for saving
        text: text to train on
        '''

        perplexities = []
        # train single document model
        (vlist, docs) = self.encode_docs([text])
        print(vlist)
        V = len(vlist) # input layer size

        model = r.RNNLM_BPTT(V, K)
        for i in range(I):
            perplexities.append([i,model.perplexity(docs)])
            model.learn(docs, a)
            a = a * 0.95 + 0.01
        perplexities.append([I,model.perplexity(docs)])

        if self.models_path != '':
            self.save(self.models_path, [vlist, model], name)
        return perplexities

    def save(self, path, data, name):
        '''
        path: save location
        data: the model to save
        name: filename to save under
        '''
        with open(os.path.join(path,name), 'wb') as f:
            pickle.dump(data, f)
        f.close()

    def load(self, path, name):
        '''
        path: location from  which to load
        name: name of the file
        '''
        with open(os.path.join(path,name), 'rb') as f:
            data = pickle.load(f)
        f.close()
        return data
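A small constructed illustration of what encode_docs above returns, following the snippet's convention that index 0 is "<s>" and index 1 is "</s>" (the example inputs are made up, not taken from the original corpus).

# Constructed example for encode_docs: two short "documents".
# (qrnnlm stands for an instance of the QRNNLM class above.)
# vlist, edocs = qrnnlm.encode_docs([["the", "cat"], ["the", "dog"]])
# vlist -> ['<s>', '</s>', 'the', 'cat', 'dog']
# edocs -> [[2, 3, 1], [2, 4, 1]]   # each encoded doc ends with 1 ("</s>")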
Example #22
 def add_langauge(self, pattern, response_pattern, language=ENGLISH):
     self._response = response_pattern
     CorpusReader.add_langauge(self, pattern, language)
Example #23
import tensorflow as tf

from corpus_reader import CorpusReader

batch_size = 100
neg_samples = 40
embedding_size = 200
window_size = 1

def init_weights(shape):
    init = tf.truncated_normal(shape, stddev = 0.1)
    return tf.Variable(init)

def init_biases(shape):
    init = tf.constant(0.1, shape=shape)
    return tf.Variable(init)

corpus = CorpusReader('data', window_size=window_size)
vocabulary_size = corpus.build_dictionary()

X_train = tf.placeholder(tf.int32, [batch_size])
y_train = tf.placeholder(tf.int32, [batch_size])
y = tf.reshape(y_train, [-1, 1])

embeddings = init_weights([vocabulary_size, embedding_size])
W = init_weights([vocabulary_size, embedding_size])
b = init_biases([vocabulary_size])

batch_embed = tf.nn.embedding_lookup(embeddings, X_train)

loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(W, b, batch_embed, y, neg_samples, vocabulary_size))
train = tf.train.AdamOptimizer(1e-3).minimize(loss)
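The snippet above builds the graph but stops before the session loop. Below is a minimal, hedged sketch of how it might be driven with the TF 1.x session API; corpus.next_batch(batch_size) is a hypothetical helper for producing (center, context) id arrays, since this CorpusReader's batching interface is not shown.

# Hedged training-loop sketch for the graph above (TF 1.x API).
# `corpus.next_batch(batch_size)` is hypothetical, not a documented method.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(10000):
        centers, contexts = corpus.next_batch(batch_size)
        _, batch_loss = sess.run([train, loss],
                                 feed_dict={X_train: centers, y_train: contexts})
        if step % 1000 == 0:
            print(step, batch_loss)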
Example #24
        queries = []
        if args.query:
            queries.append(args.query)
        if args.file:
            with open(args.file, 'r') as fin:
                queries.extend([line.strip().split('\t')[1] for line in fin])

        result = {}
        queries = {
            query: collections.OrderedDict(
                itertools.islice(ranker.rank(query).items(), 10))
            for query in queries
        }

        for filename in filenames:
            for pmid, document in CorpusReader(filename).items():
                toremove = list()
                for query, scores in queries.items():
                    score = scores.pop(pmid, None)
                    if score is not None:
                        if len(scores) == 0:
                            toremove.append(query)
                        result_scores = result.setdefault(query, [])
                        result_scores.append((document, score))
                for query in toremove:
                    queries.pop(query)
                if len(queries) == 0:
                    break
            else:  # Continues if the inner loop DIDN'T break!
                continue
            break
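The closing for/else above relies on Python's loop-else semantics, which are easy to misread; here is a minimal standalone illustration of the same pattern (not from the original project).

# Minimal illustration of the for/else pattern used above: the else branch
# runs only when the inner loop finishes WITHOUT breaking, and its `continue`
# moves on to the next filename; if the inner loop breaks, control falls
# through to the outer `break` and the whole scan stops.
for filename in ["a", "b", "c"]:
    for item in range(3):
        if filename == "b" and item == 1:
            break              # found what we were looking for in this file
    else:
        continue               # inner loop exhausted: try the next file
    break                      # inner loop broke: stop scanning entirely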
Example #25
 def index(self, corpus_reader: CorpusReader):
     for pmid, document in corpus_reader.items():
         self.update(pmid, document)
         self.documents.add(pmid)
         if self.process.memory_percent() >= self.max_memory_usage:
             self.dispatch()
Example #26
def Main():
    while True:
        # Display menu options
        DisplayMenu()
        op = raw_input("\nOption > ")
        if not (op.isdigit() and int(op) in [0, 1, 2, 3, 4, 5, 6]):
            print "Invalid option"
            continue
        op = int(op)
        if op == 0:
            # Exit
            break
        else:
            # Read the parameters
            parameter = parameters[op-1]
            name = parameters[op-1][0].split("/")[-1]
            corpus = CorpusReader(
                parameter[0],
                parameter[1],
                parameter[2],
                parameter[3],
                parameter[4],
                category_position=parameter[5],
                category_level=parameter[6],
                start=parameter[7],
                decoding=parameter[8],
            )
            
            try:     
                # Get reviews and shuffle them 
                reviews = list(enumerate(corpus.get_opinions())) # TODO: replace with a DB read
                op = raw_input("\nInsert IDs separated by ',' or <intro> for pick up randomly > ")
                if op: # From indexes
                    indexes = [int(i) for i in op.split(',')]
                    indexes = set(indexes)  # Ensure no duplicated
                    indexes = list(indexes) # Transform
                    left = len(indexes)
                else: # Randomly
                    while not op.isdigit():
                        op = raw_input("How many? > ")
                    left = int(op)
                    indexes = range(len(reviews))
                    random.shuffle(indexes)
                indexes = indexes[:left]
                reviews = [(i,review) for (i,review) in reviews if i in indexes]
                result = []
                
                # Tag every review
                while left != 0:   
                                     
                    # Start
                    id,review = reviews[left-1]
                    words = review.split(' ')
                    total = len(words)
                    cats = ['  ' for _ in range(total)]
                    
                    # For each word annotate with (N) or (I) and give the possibility of back by pressing (B)
                    cat = ""
                    idx = 0
                    while True:
                        # Display review
                        DisplayReview(id,idx,total,words,cats)
                        
                        # Check end condition
                        if idx == total:
                            op = raw_input("\nDone. Proceed with the next review (left %i)? [y/n] > " % (left-1))
                            if op == 'y':
                                break
                            idx = idx - 1 if idx != 0 else 0
                            cats[idx] = '  '
                            continue
                        
                        # Ask for input
                        tooltip  = "\nTag with N(ormal) or I(nverted). "
                        tooltip += "Enter A(bort), B(ack) or <intro> for "
                        tooltip += "repeating last action (%s) > " % (cat.upper() if cat else "None")
                        tag = raw_input(tooltip)
                        
                        if not tag and not cat: # Prevents parse empty cat
                            print "Input a category first";raw_input()
                            continue
                        elif tag:
                            cat = tag
                        
                        # Action from decision
                        cat = cat.lower()
                        if not cat or cat not in 'niba':
                            print "Option",cat,"is not correct." ;raw_input()
                            continue
                        if cat == 'b': # Back
                            idx = idx - 1 if idx != 0 else 0
                            cats[idx] = '  '
                        elif cat == 'a':
                            op = raw_input("Are you sure you want to abort (left %i)? [y/n] > " % left)
                            if op.lower() == 'y': raise Exception("Abort")
                        else:
                            # Associate the category
                            cats[idx] = cat
                            idx = idx + 1
                            
                    # Save the result as two list: words and its respective category for each one 
                    result.append({
                        "id" : id+1,
                        "from" : name,
                        "annotation" : ' '.join(word.lower()+"/"+cat for word,cat in zip(words,cats))
                    })
                    
                    # Update
                    left -= 1
                       
                # View and save results
                if op == 0: continue
                ViewSave(result,name)
            
            except Exception as e:
                content = json.dumps(result,indent=4,ensure_ascii=False)
                error = "Corpus:%s, Review:%i, Description:%s Partial:%s" % (name,id,str(e),content)
                log(error)
                raw_input("Reason: %s\nEnter to cotinue..." % str(e))
Example #27
def train_model4(classifier):

    path1 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRISK2020_T1_training_data',
                         'eRISK2020_T1_training_data',
                         'eRISK2020_training_data')
    path2 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRisk2020_T1_test_data', 'eRisk2020_T1_test_data',
                         'T1')
    print("Creating Corpus Reader for training")
    corpus_reader_train = CorpusReader(path1)
    corpus_reader_train.load()
    print("Corpus Reader for training created")
    corpus_reader_test = CorpusReader(path2)
    corpus_reader_test.load()
    print("Corpus Reader for testing created")

    emo = Emojis()
    token = Token("yake")
    """ set the tokenizer and model parameters """

    bert_model = SentenceTransformer('paraphrase-mpnet-base-v2')

    # create the bert
    bert_transformer = BigBird(bert_model)

    sentiment = Sentiment()
    """ training the model """
    print("Initializing Training")

    model = Pipeline([
        ('emojis', emo),
        ('tokenizer', token),
        ('union',
         FeatureUnion(transformer_list=[
             ("vectorizer", bert_transformer),
             ("sentiment", sentiment),
         ])),
        ("classifier", classifier),
    ])

    batch_size = 40

    num_users = len(corpus_reader_train.subjects)

    count = 0
    all_texts = list()
    all_gt = list()
    for i in range(0, num_users, batch_size):

        all_texts.append([
            subject.posts
            for subject in corpus_reader_train.subjects[(batch_size *
                                                         count):(batch_size *
                                                                 (count + 1))]
        ])
        all_gt.append([
            subject.gt
            for subject in corpus_reader_train.subjects[(batch_size *
                                                         count):(batch_size *
                                                                 (count + 1))]
        ])
        count += 1

    for i in range(len(all_texts)):
        model.fit(all_texts[i], all_gt[i])
    '''
    num_users = len(corpus_reader_test.subjects)
    
    
    all_texts = list()
    all_gt = list()
    count = 0
    for i in range(0, num_users, batch_size):
        
        all_texts.append([ subject.posts  for subject in corpus_reader_test.subjects[(batch_size * count) : (batch_size * (count + 1))]  ])
        all_gt.append([ subject.gt for subject in corpus_reader_test.subjects[(batch_size * count) : (batch_size * (count + 1))] ])
        count += 1

    
    for i in range(len(all_texts)):
        model.fit(all_texts[i], all_gt[i])
    
    '''

    print("End of training")

    # It's important to use binary mode
    dbfile = open(f'model4_{classifier.__class__.__name__}.sav', 'wb')
    pickle.dump(model, dbfile)
    return model
Example #28
N_COMPONENTS = parameters['N_COMPONENTS']

MODEL_PATH = parameters['MODEL_PATH']

NUM_OF_SAMPLES = parameters['NUM_OF_SAMPLES']
WINDOW_SIZE = parameters['WINDOW_SIZE']

TEST_FOLDER = parameters['TEST_FOLDER']
TEST_FILE = parameters['TEST_FILE']

MODE = parameters['MODE']


if __name__ == '__main__':

    reader = CorpusReader(DATA_PATH, FOLDER_NAME, NUM_OF_SAMPLES=NUM_OF_SAMPLES)

    todo_path = os.path.join('bin', FOLDER_NAME+'_todo.json')
    done_path = os.path.join('bin', FOLDER_NAME + '_done.json')

    if os.path.exists(todo_path) and os.path.exists(done_path):
        with open(todo_path, 'r') as todo_f:
            todo = json.load(todo_f)
            todo_list = todo['document_ids']
        with open(done_path, 'r') as done_f:
            done = json.load(done_f)
            done_list = done['document_ids']

        assert reader.documents_amount == len(todo_list), "Something wrong within the corpus, please delete 'bin' folder and re-run it."
        if len(todo_list) != len(done_list):
            build_elasticsearch(data_path=DATA_PATH, zipfile_name=FOLDER_NAME)