Example #1
def indexit(tokenizer, filenames):
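    # Index every corpus file with the given tokenizer, then sort the index before returning it.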
    indexer = Indexer(tokenizer)
    for filename in filenames:
        corpus_reader = CorpusReader(filename)
        indexer.index(corpus_reader)
    indexer.sort()
    return indexer
Example #2
def main():
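    # Train a unigram/bigram bag-of-words LogisticRegression baseline on the eRisk T1 training data,
    # run the simulation on the test subjects and report precision, recall, F1 and ERDE.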
    path = os.path.join('..', '..', 'dataset', 'eRISK2020_T1_training_data',
                        'td')

    print("Creating Corpus Reader for training")
    corpus_reader_train = CorpusReader(path)
    corpus_reader_train.load()
    print("Corpus Reader for training created")

    path = os.path.join('..', '..', 'dataset', 'T1_test_data', 'td')
    gt_name = 'T1_erisk_golden_truth.txt'
    corpus_reader_test = CorpusReader(path, gt_name)
    corpus_reader_test.load()

    all_texts = [
        ''.join(map(str, subject.posts))
        for subject in corpus_reader_train.subjects
    ]
    all_gt = [subject.gt for subject in corpus_reader_train.subjects]

    count_vectorizer = CountVectorizer(analyzer='word',
                                       token_pattern=r'\w+',
                                       ngram_range=(1, 2))
    bow = dict()
    bow["train"] = (count_vectorizer.fit_transform(all_texts), all_gt)

    lr_classifier = LogisticRegression(solver='liblinear')
    lr_classifier.fit(*bow["train"])

    matrix = Matrix(len(corpus_reader_test.subjects),
                    corpus_reader_test.subjects)
    args = {'matrix': matrix, 'vec': count_vectorizer, 'class': lr_classifier}

    matrix = run_simulation(args)

    print(matrix)

    # analyze results
    precision = measures.calc_precision(corpus_reader_test.subjects, matrix)
    recall = measures.calc_recall(corpus_reader_test.subjects, matrix)
    f1 = measures.calc_f1(precision, recall)
    ERDE = measures.calc_ERDE(corpus_reader_test.subjects, matrix)
Example #3
def indexit(tokenizer,
            filenames,
            store_positions=False,
            calculate_tfidf=False,
            memory_usage=20):
    index = Index(tokenizer, store_positions)
    indexer = Indexer(index, 'index', max_memory_usage=memory_usage)
    for filename in filenames:
        indexer.index(CorpusReader(filename))
    indexer.merge(calculate_tfidf)
    return index
Example #4
def indexit(tokenizer,
            filenames,
            store_positions=False,
            calculate_tfidf=False,
            memory_usage=20):
    indexer = Indexer(tokenizer,
                      'indexer',
                      store_positions=store_positions,
                      max_memory_usage=memory_usage)
    for filename in filenames:
        corpus_reader = CorpusReader(filename)
        indexer.index(corpus_reader)
    indexer.merge(calculate_tfidf)
    return indexer
Example #5
def main():
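    # Build windowed features from the lines on stdin, vectorize them and run the classification pipeline.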
    args = parse_args()
    r = CorpusReader(accent_map=args.accents, filter_punct=args.filter_punct, lower=args.lower)
    featdict, labels = r.get_featdict_from_lines(stdin, window=args.window)
    vec = DictVectorizer()
    X = vec.fit_transform(featdict).toarray()
    y, label_d = convert_labels(labels)
    cnt = defaultdict(int)
#    for l in y:
#        cnt[label_d[l]] += 1
#    for k, v in cnt.iteritems():
#        print('{0} {1}'.format(k.encode('utf8'), v))
    #print label_d
    #print(vec.fit_transform(featdict).toarray())
    #print vec.get_feature_names()
    run_pipeline(X, y)
Example #6
	def __init__(
		self,
		files=[],
		directories=[],
		skip=[],
		unigram_dictionary=None,
		noise_ratio=15,
		kernel=[1,2,3,4,5,5,4,3,2,1],
		t = 1.0e-5,
		batch_size = 1000,
		parse=default_parse,
		verbose=True
	):

		# Get a corpus reader
		self.corpus_reader = CorpusReader(
			files=files, directories=directories, skip=skip, parse=parse,
			verbose=verbose
		)

		# Load the unigram_dictionary
		if unigram_dictionary is not None:
			self.unigram_dictionary = unigram_dictionary
		else:
			self.unigram_dictionary = UnigramDictionary()

		self.noise_ratio = noise_ratio
		self.kernel = kernel
		self.t = t
		self.batch_size = batch_size

		# Validate the kernel.  It should reflect the relative 
		# frequencies of choosing tokens from a window of +/- K tokens
		# relative to a query token.  So it must have an even number of
		# entries
		if not len(self.kernel) % 2 == 0:
			raise ValueError(
				'kernel should reflect the relative frequencies of '
				'selecting a context token within +/- K of the query '
				'token, and so should have an equal number of entries '
				'defining frequencies to the left and right of the query '
				'token, and so should have an even number of entries.'
			)
Example #7
    def run(self):
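        # Score every post with VADER; for posts whose compound score passes the threshold, collect the
        # short word spans whose own compound score also passes it.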
        corpus_reader = CorpusReader(self.path)
        corpus_reader.load()
        analyser = SentimentIntensityAnalyzer()

        num_subs = len(corpus_reader.subjects)
        for i, sub in enumerate(corpus_reader.subjects):
            print(f"Number of subjects left: {num_subs - i}")
            for post in sub.posts:
                score = analyser.polarity_scores(str(post))
                s = score['compound']
                if abs(s) > self.threshold:
                    string = spplit(str(post))
                    for j in range(3):
                        for k in range(len(string) - j):
                            score_word = analyser.polarity_scores(' '.join(
                                string[k:(k + j)]))
                            word_compound = score_word['compound']
                            if abs(word_compound) > self.threshold:
                                if string[k] not in self.imp_words:
                                    self.imp_words.append(' '.join(
                                        string[k:(k + j)]))
Example #8
    #mod3 = pickle.load(open(MODEL3_NAME, 'rb'))
    #mod4 = pickle.load(open(MODEL4_NAME, 'rb'))
    #mod5 = pickle.load(open(MODEL5_NAME, 'rb'))
    #mod6 = pickle.load(open(MODEL6_NAME, 'rb'))
    #mod7 = pickle.load(open(MODEL7_NAME, 'rb'))
    #device = torch.device("cuda")
    #no_vader.to(device)

    path = os.path.join('..', 'data', 'erisk-2021-t2')
    #path = os.path.join( '..', '..',  'dataset', 'T1_test_data', 'test')

    gt_name = 'golden_truth.txt'

    corpus_reader_test = CorpusReader(path)
    corpus_reader_test.load()

    with open("file.txt", 'w') as f:
        for sub in corpus_reader_test.subjects:
            f.write("{} - {}\n".format(sub.id, sub.gt))

    filename = "RESULTS_TEST_more_model3_no_token_param.txt"

    #clean file
    with open(filename, 'w') as file:
        pass

    # find the greatest number of posts
    posts_max = max([ len(s.posts) for s in corpus_reader_test.subjects ])
    print(posts_max)
Example #9
    def __init__(
            self,
            max_len=50,  # Maximum sentence length, same for questions, answers and reviews
            num_reviews=20,  # Number of review candidates for each QA pair
            selftest=False,
            if_only_top_ans=True,
            top_score_recorder=None,
            load_meta=True,
            load_vocab=True,
            load_qa=True,
            load_review=True,
            load_word_embedding=True):
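        # Prefer previously cached artifacts; if any load fails with IOError, rebuild everything from the corpus.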
        try:
            # if not selftest:
            #   filename = os.path.join(DATA_PATH, 'datautil.pickle')
            # else:
            #   filename = os.path.join(DATA_PATH, 'datautil-selftest.pickle')
            # logger.info('Loading stored data from {} ...'.format(filename))
            # with open(filename, 'rb') as f:
            #   tmp_dict = pickle.load(f)
            # self.__dict__.clear()
            # self.__dict__.update(tmp_dict)
            self.selftest = selftest
            if load_meta:
                self._load_meta()
            if load_vocab:
                self._load_vocab()
            if load_qa:
                self._load_qa()
            if load_review:
                self._load_review()
            if load_word_embedding:
                self._load_word_embedding()
        except IOError:
            logger.info('Stored data not found, preprocessing ...')
            self.selftest = selftest
            self.max_len = max_len
            self.num_reviews = num_reviews

            logger.info('Initializing CorpusReader ...')
            corpusreader = CorpusReader(
                maxline=SELF_TEST_MAX_LINE if selftest else -1,
                num_reviews=(5 * self.num_reviews),
                if_only_top_ans=if_only_top_ans,
                load_glove=False if selftest else True)
            self.vocab_size = corpusreader.vocab_size
            self.num_pos_tags = corpusreader.num_pos_tags
            self.embed_matrix = corpusreader.embed_matrix
            self.w_embed_size = corpusreader.w_embed_size
            self.word2id = corpusreader.word2id
            self.id2word = corpusreader.id2word
            self.id2freq = corpusreader.id2freq
            self.pos2id = corpusreader.pos2id
            self.id2pos = corpusreader.id2pos

            logger.info('Read corpus data and convert to arrays ...')
            data, review_data, asin2id = self._read_into_arrays(
                corpusreader=corpusreader, if_only_top_ans=if_only_top_ans)
            self.review_data = review_data
            del corpusreader
            del review_data
            gc.collect()

            logger.info('Calculate review IDF ...')
            self.review_idf = self._get_review_idf()

            logger.info('Splitting data into train, dev, test sets ...')
            self._train_idx, self._dev_idx, self._test_idx = [], [], []
            self._train_size, self._dev_size, self._test_size = 0, 0, 0
            self._data_split(data)
            del data
            gc.collect()

            # logger.info('Storing into {}...'.format(filename))
            # with open(filename, 'wb') as f:
            #   pickle.dump(self.__dict__, f)
            self._save_meta()
            self._save_vocab()
            self._save_qa()
            self._save_review()
            self._save_word_embedding()

        self._block_to_dense()
        self.top_score_recorder = top_score_recorder
        if self.top_score_recorder is not None:
            logger.info("Train with Pseudo Relevance Feedbacks")
        self._print_info()
Example #10
from corpus_reader import CorpusReader
from preprocess import PreProcess
from tf_idf import TfIdf
from knn import KNN
from metrics import MetricsGenerator
from pprint import pprint as pp

if __name__ == '__main__':
  print('reading...')
  reader = CorpusReader()
  reader.run()
  
  parser = PreProcess()
  parsed_trainning_documents = {}
  print('processing...')
  for k, v in reader.train.items():
    parsed_trainning_documents[k] = parser.process(v)
  
  # Input for tf-idf: we must annotate the documents with their classes.
  # It receives an array of tuples as input: ([tokens], class)
  parsed_trainning_documents_with_classes = []
  for k in parsed_trainning_documents.keys():
    parsed_trainning_documents_with_classes += [(v, k) for v in parsed_trainning_documents[k]]
  
  # Run tf-idf
  print('generating tf.idf...')
  tf_idf_calculator = TfIdf(parsed_trainning_documents_with_classes)
  tf_idf_calculator.run()
  
  # test the knn parameters: distance metric and value of K
  for metric in ['cosine', 'euclid']:
Example #11
__author__ = 'rwechsler'

import gensim
import sys
import glob
from corpus_reader import CorpusReader

files = glob.glob(sys.argv[1])
outfile_name = sys.argv[2]

dataset = CorpusReader(files)

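# Train a 500-dimensional Word2Vec model on the corpus and save it (pre-4.0 gensim API; newer releases renamed size to vector_size)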
model = gensim.models.Word2Vec(dataset,
                               size=500,
                               window=5,
                               min_count=3,
                               negative=5,
                               workers=2)

model.save(outfile_name)
Example #12
def train6():
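    # Grid-search a GradientBoosting pipeline over sentence-transformer embeddings plus sentiment features,
    # fitting it on batches of subjects with progressively longer post histories.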

    with open("log.txt", 'w') as f:
        pass
    #path1 = os.path.join( '..', '..',  'dataset', 'eRISK2020_T1_training_data', 'train') 
    #path1 = os.path.join( '..', 'data', 'erisk-2021-t2', 'td') 
    path1 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRISK2020_T1_training_data',
                         'eRISK2020_T1_training_data',
                         'eRISK2020_training_data')
    path2 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRisk2020_T1_test_data', 'eRisk2020_T1_test_data',
                         'T1')
    print("Creating Corpus Reader for training")
    corpus_reader_train = CorpusReader(path1)
    corpus_reader_train.load()
    print("Corpus Reader for training created")
    corpus_reader_test = CorpusReader(path2)
    corpus_reader_test.load()
    print("Corpus Reader for testing created")

    emo = Emojis()
    token = Token()

    """ set the tokenizer and model parameters """
    #tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    #bert_model = BertModel.from_pretrained("bert-base-uncased")
    bert_model = SentenceTransformer('paraphrase-mpnet-base-v2')
    #device = torch.device("cuda")

    #bert_model.to(device)

    # create the bert
    bert_transformer = BigBird(bert_model)

    sentiment = Sentiment()

    """ training the model """
    print("Initializing Training")
    #n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0
    parameters = {
        'classifier__n_estimators': [50, 100, 500, 1000],
        'classifier__learning_rate': [0.001, 0.01, 0.1, 1.0],
        'classifier__max_depth': [1, 3, 5, 10],
    }
    classifier = GradientBoostingClassifier()

    model = Pipeline([
        ('emojis', emo),
        #('tokenizer', token),
        ('union', FeatureUnion(transformer_list=[
            ("vectorizer", bert_transformer),
            ("sentiment", sentiment),
        ])),
        ("classifier", classifier),
    ])

    clf = GridSearchCV(model, parameters)

    batch_size = 40

    num_users = len(corpus_reader_train.subjects)
    #print(num_users)
    for j in range(50, 2000, 50):
        count = 0
        all_texts = list()
        all_gt = list()
        for i in range(0, num_users, batch_size):
            #print(i)
            all_texts.append([
                subject.posts[0:j]
                for subject in corpus_reader_train.subjects[(batch_size * count):(batch_size * (count + 1))]
            ])
            all_gt.append([
                subject.gt
                for subject in corpus_reader_train.subjects[(batch_size * count):(batch_size * (count + 1))]
            ])
            count += 1

        print(all_gt[0])
        for i in range(len(all_texts)):
            clf.fit(all_texts[i], all_gt[i])

    
    num_users = len(corpus_reader_test.subjects)

    #print(num_users)
    for j in range(50, 2000, 50):
        all_texts = list()
        all_gt = list()
        count = 0
        for i in range(0, num_users, batch_size):
            print(i)
            all_texts.append([
                subject.posts[0:j]
                for subject in corpus_reader_test.subjects[(batch_size * count):(batch_size * (count + 1))]
            ])
            all_gt.append([
                subject.gt
                for subject in corpus_reader_test.subjects[(batch_size * count):(batch_size * (count + 1))]
            ])
            count += 1

        for i in range(len(all_texts)):
            clf.fit(all_texts[i], all_gt[i])

    print("End of training")
    return clf
Example #13
def get_input_option(prompt, options):
    res = input(prompt + " (" + "/".join(options) + ") ")
    while res not in options:
        res = input("pardon? (" + "/".join(options) + ") ")
    return res


if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(description='Corpus Filter')
    arg_parser.add_argument('corpus_file', help='path to the corpus file')
    arg_parser.add_argument('output_prefix', help='path to the output files')
    args = arg_parser.parse_args()

    print('\n - Filtering Corpus -\n')

    corpus = CorpusReader(args.corpus_file)
    file_output_pos = open(args.output_prefix + '.pos', 'w', encoding='utf8')
    file_output_neg = open(args.output_prefix + '.neg', 'w', encoding='utf8')
    file_output_fav = open(args.output_prefix + '.fav', 'w', encoding='utf8')

    for tweet in corpus.text_json():
        tweet = tweet.replace('\n', ' ')
        tweet = tweet.strip()
        print('"' + tweet + '"')
        action = get_input_option('sarcasm detected?', ['y', 'n', 'f', 'q'])
        if action == 'f':
            file_output_fav.write(tweet + '\n')
            action = get_input_option('faved, but is there sarcasm?',
                                      ['y', 'n', 'q'])
        if action == 'y':
            file_output_pos.write(tweet + '\n')
Example #14
        queries = []
        if args.query:
            queries.append(args.query)
        if args.file:
            with open(args.file, 'r') as fin:
                queries.extend([line.strip().split('\t')[1] for line in fin])

        result = {}
        queries = {
            query: collections.OrderedDict(
                itertools.islice(ranker.rank(query).items(), 10))
            for query in queries
        }

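        # Scan each corpus file once, collecting (document, score) pairs for every query that ranked it,
        # and stop as soon as all queries' top-10 lists have been resolved.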
        for filename in filenames:
            for pmid, document in CorpusReader(filename).items():
                toremove = list()
                for query, scores in queries.items():
                    score = scores.pop(pmid, None)
                    if score is not None:
                        if len(scores) == 0:
                            toremove.append(query)
                        result_scores = result.setdefault(query, [])
                        result_scores.append((document, score))
                for query in toremove:
                    queries.pop(query)
                if len(queries) == 0:
                    break
            else:  # Continues if the inner loop DIDN'T break!
                continue
            break
Example #15
def Main():
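    # Interactive console loop: choose a corpus, then tag each word of the selected reviews as N(ormal) or I(nverted).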
    while True:
        # Display menu options
        DisplayMenu()
        op = raw_input("\nOption > ")
        if not (op.isdigit() and int(op) in [0, 1, 2, 3, 4, 5, 6]):
            print "Invalid option"
            continue
        op = int(op)
        if op == 0:
            # Exit
            break
        else:
            # Read the parameters
            parameter = parameters[op-1]
            name = parameters[op-1][0].split("/")[-1]
            corpus = CorpusReader(
                parameter[0],
                parameter[1],
                parameter[2],
                parameter[3],
                parameter[4],
                category_position=parameter[5],
                category_level=parameter[6],
                start=parameter[7],
                decoding=parameter[8],
            )
            
            try:     
                # Get reviews and shuffle them 
                reviews = list(enumerate(corpus.get_opinions())) # TODO: Replace this with reading from the DB
                op = raw_input("\nInsert IDs separated by ',' or <intro> for pick up randomly > ")
                if op: # From indexes
                    indexes = [int(i) for i in op.split(',')]
                    indexes = set(indexes)  # Ensure no duplicates
                    indexes = list(indexes) # Convert back to a list
                    left = len(indexes)
                else: # Randomly
                    while not op.isdigit():
                        op = raw_input("How many? > ")
                    left = int(op)
                    indexes = range(len(reviews))
                    random.shuffle(indexes)
                indexes = indexes[:left]
                reviews = [(i,review) for (i,review) in reviews if i in indexes]
                result = []
                
                # Tag every review
                while left != 0:   
                                     
                    # Start
                    id,review = reviews[left-1]
                    words = review.split(' ')
                    total = len(words)
                    cats = ['  ' for _ in range(total)]
                    
                    # For each word, annotate it with (N) or (I); pressing (B) goes back one word
                    cat = ""
                    idx = 0
                    while True:
                        # Display review
                        DisplayReview(id,idx,total,words,cats)
                        
                        # Check end condition
                        if idx == total:
                            op = raw_input("\nDone. Proceed with the next review (left %i)? [y/n] > " % (left-1))
                            if op == 'y':
                                break
                            idx = idx - 1 if idx != 0 else 0
                            cats[idx] = '  '
                            continue
                        
                        # Ask for input
                        tooltip  = "\nTag with N(ormal) or I(nverted). "
                        tooltip += "Enter A(bort), B(ack) or <intro> for "
                        tooltip += "repeating last action (%s) > " % (cat.upper() if cat else "None")
                        tag = raw_input(tooltip)
                        
                        if not tag and not cat: # Prevents parse empty cat
                            print "Input a category first";raw_input()
                            continue
                        elif tag:
                            cat = tag
                        
                        # Action from decision
                        cat = cat.lower()
                        if not cat or cat not in 'niba':
                            print "Option",cat,"is not correct." ;raw_input()
                            continue
                        if cat == 'b': # Back
                            idx = idx - 1 if idx != 0 else 0
                            cats[idx] = '  '
                        elif cat == 'a':
                            op = raw_input("Are you sure you want to abort (left %i)? [y/n] > " % left)
                            if op.lower() == 'y': raise Exception("Abort")
                        else:
                            # Associate the category
                            cats[idx] = cat
                            idx = idx + 1
                            
                    # Save the result as two lists: the words and the category assigned to each one
                    result.append({
                        "id" : id+1,
                        "from" : name,
                        "annotation" : ' '.join(word.lower()+"/"+cat for word,cat in zip(words,cats))
                    })
                    
                    # Update
                    left -= 1
                       
                # View and save results
                if op == 0: continue
                ViewSave(result,name)
            
            except Exception as e:
                content = json.dumps(result,indent=4,ensure_ascii=False)
                error = "Corpus:%s, Review:%i, Description:%s Partial:%s" % (name,id,str(e),content)
                log(error)
                raw_input("Reason: %s\nEnter to continue..." % str(e))
Example #16
def train_model1(classifier):
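    # Fit a pipeline of emoji handling, tokenization and sentence-transformer embeddings with the given
    # classifier on batches of eRisk subjects, then pickle the trained model.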

    path1 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRISK2020_T1_training_data',
                         'eRISK2020_T1_training_data',
                         'eRISK2020_training_data')
    path2 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRisk2020_T1_test_data', 'eRisk2020_T1_test_data',
                         'T1')
    print("Creating Corpus Reader for training")
    corpus_reader_train = CorpusReader(path1)
    corpus_reader_train.load()
    print("Corpus Reader for training created")
    corpus_reader_test = CorpusReader(path2)
    corpus_reader_test.load()
    print("Corpus Reader for testing created")

    emo = Emojis()
    token = Token("normal")
    """ set the tokenizer and model parameters """
    #tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    #bert_model = BertModel.from_pretrained("bert-base-uncased")
    bert_model = SentenceTransformer('paraphrase-mpnet-base-v2')
    #device = torch.device("cuda")

    #bert_model.to(device)

    # create the bert
    bert_transformer = BigBird(bert_model)

    sentiment = Sentiment()
    """ training the model """
    print("Initializing Training")
    #classifier = svm.SVC(C = 1, gamma = 'scale', kernel = 'linear', probability = True)
    #clf = CalibratedClassifierCV(classifier)
    #classifier = svm.SVC(C = 1, gamma = 'scale', kernel = 'linear', probability = True)
    #classifier = AdaBoostClassifier(learning_rate = 0.01, n_estimators = 100)

    #clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)

    model = Pipeline([
        ('emojis', emo),
        ('tokenizer', token),
        (
            'union',
            FeatureUnion(transformer_list=[
                ("vectorizer", bert_transformer),
                #("sentiment", sentiment),
            ])),
        ("classifier", classifier),
    ])

    batch_size = 40

    num_users = len(corpus_reader_train.subjects)
    #print(num_users)
    for j in range(50, 2000, 50):
        count = 0
        all_texts = list()
        all_gt = list()
        for i in range(0, num_users, batch_size):
            #print(i)

            all_texts.append([
                subject.posts[0:j]
                for subject in corpus_reader_train.subjects[(
                    batch_size * count):(batch_size * (count + 1))]
            ])
            all_gt.append([
                subject.gt for subject in corpus_reader_train.subjects[(
                    batch_size * count):(batch_size * (count + 1))]
            ])
            count += 1

        print(all_gt[0])
        for i in range(len(all_texts)):
            model.fit(all_texts[i], all_gt[i])

    num_users = len(corpus_reader_test.subjects)

    #print(num_users)
    for j in range(50, 2000, 50):
        all_texts = list()
        all_gt = list()
        count = 0
        for i in range(0, num_users, batch_size):
            print(i)
            all_texts.append([
                subject.posts[0:j] for subject in corpus_reader_test.subjects[(
                    batch_size * count):(batch_size * (count + 1))]
            ])
            all_gt.append([
                subject.gt for subject in corpus_reader_test.subjects[(
                    batch_size * count):(batch_size * (count + 1))]
            ])
            count += 1

        for i in range(len(all_texts)):
            model.fit(all_texts[i], all_gt[i])

    print("End of training")

    # It's important to use binary mode
    with open(f'model1_{classifier.__class__.__name__}.sav', 'wb') as dbfile:
        pickle.dump(model, dbfile)
    return model
Example #17
    ("../../corpus/corpus_cine","*.xml","<body>(.*?)</body>","rank=\"(.*?)\"","FILE",
        "BEFORE",None,0,'utf8',{u'1': 0, u'2': 25, u'3': 50, u'4': 75, u'5': 100}),
    ("../../corpus/corpus_hoteles","*.xml","<coah:review>(.*?)</coah:review>","<coah:rank>(.*?)</coah:rank>","FILE",
        "BEFORE",None,0,'utf8',{u'1': 0, u'2': 25, u'3': 50, u'4': 75, u'5': 100}),
    ("../../corpus/corpus_prensa_uy","*.csv","\"(.*?)\",(?:TRUE|FALSE)",",(.*?)\\n","FILE",
        "AFTER",None,0,'utf8',{u'Neg': 0, u'Neu': 50, u'Pos': 100}),
    ("../../corpus/corpus_tweets","*.tsv","(.*?)\\t.*?\\n","(.*?\\t.*?)\\t","FILE",
        "BEFORE",None,1,'utf8',{u'3\t1': 10, u'3\t2': 20, u'2\t4': 90, u'2\t2': 70, u'2\t3': 60, u'4\t2': 30, u'2\t1': 80, 
                                u'5\t1': 40, u'1\t5': 50, u'1\t4': 30, u'4\t1': 50, u'1\t1': 40, u'1\t3': 60, u'1\t2': 70}),
    ("../../corpus/corpus_variado_sfu","*/*.txt","(.*)\s","(.*?)_","PATH",
        None,1,0,'utf8',{'no': 0, 'yes': 100})
]

# Read each corpus
from corpus_reader import CorpusReader
for parameter in parameters:
    reader = CorpusReader(
                    parameter[0],
                    parameter[1],
                    parameter[2],
                    parameter[3],
                    parameter[4],
                    category_position=parameter[5],
                    category_level=parameter[6],
                    start=parameter[7],
                    decoding=parameter[8],
                )
    fun = parameter[9]
    data = reader.get_data(lambda x:fun[x])
    
Example #18
def train_model4(classifier):
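    # Same idea as model1 but with the "yake" tokenizer and sentiment features; trains on whole post
    # histories in batches of 40 subjects and pickles the fitted pipeline.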

    path1 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRISK2020_T1_training_data',
                         'eRISK2020_T1_training_data',
                         'eRISK2020_training_data')
    path2 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRisk2020_T1_test_data', 'eRisk2020_T1_test_data',
                         'T1')
    print("Creating Corpus Reader for training")
    corpus_reader_train = CorpusReader(path1)
    corpus_reader_train.load()
    print("Corpus Reader for training created")
    corpus_reader_test = CorpusReader(path2)
    corpus_reader_test.load()
    print("Corpus Reader for testing created")

    emo = Emojis()
    token = Token("yake")
    """ set the tokenizer and model parameters """

    bert_model = SentenceTransformer('paraphrase-mpnet-base-v2')

    # create the bert
    bert_transformer = BigBird(bert_model)

    sentiment = Sentiment()
    """ training the model """
    print("Initializing Training")

    model = Pipeline([
        ('emojis', emo),
        ('tokenizer', token),
        ('union',
         FeatureUnion(transformer_list=[
             ("vectorizer", bert_transformer),
             ("sentiment", sentiment),
         ])),
        ("classifier", classifier),
    ])

    batch_size = 40

    num_users = len(corpus_reader_train.subjects)

    count = 0
    all_texts = list()
    all_gt = list()
    for i in range(0, num_users, batch_size):

        all_texts.append([
            subject.posts
            for subject in corpus_reader_train.subjects[(batch_size *
                                                         count):(batch_size *
                                                                 (count + 1))]
        ])
        all_gt.append([
            subject.gt
            for subject in corpus_reader_train.subjects[(batch_size *
                                                         count):(batch_size *
                                                                 (count + 1))]
        ])
        count += 1

    for i in range(len(all_texts)):
        model.fit(all_texts[i], all_gt[i])
    '''
    num_users = len(corpus_reader_test.subjects)
    
    
    all_texts = list()
    all_gt = list()
    count = 0
    for i in range(0, num_users, batch_size):
        
        all_texts.append([ subject.posts  for subject in corpus_reader_test.subjects[(batch_size * count) : (batch_size * (count + 1))]  ])
        all_gt.append([ subject.gt for subject in corpus_reader_test.subjects[(batch_size * count) : (batch_size * (count + 1))] ])
        count += 1

    
    for i in range(len(all_texts)):
        model.fit(all_texts[i], all_gt[i])
    
    '''

    print("End of training")

    # It's important to use binary mode
    with open(f'model4_{classifier.__class__.__name__}.sav', 'wb') as dbfile:
        pickle.dump(model, dbfile)
    return model
Example #19
N_COMPONENTS = parameters['N_COMPONENTS']

MODEL_PATH = parameters['MODEL_PATH']

NUM_OF_SAMPLES = parameters['NUM_OF_SAMPLES']
WINDOW_SIZE = parameters['WINDOW_SIZE']

TEST_FOLDER = parameters['TEST_FOLDER']
TEST_FILE = parameters['TEST_FILE']

MODE = parameters['MODE']


if __name__ == '__main__':

    reader = CorpusReader(DATA_PATH, FOLDER_NAME, NUM_OF_SAMPLES=NUM_OF_SAMPLES)

    todo_path = os.path.join('bin', FOLDER_NAME+'_todo.json')
    done_path = os.path.join('bin', FOLDER_NAME + '_done.json')

    if os.path.exists(todo_path) and os.path.exists(done_path):
        with open(todo_path, 'r') as todo_f:
            todo = json.load(todo_f)
            todo_list = todo['document_ids']
        with open(done_path, 'r') as done_f:
            done = json.load(done_f)
            done_list = done['document_ids']

        assert reader.documents_amount == len(todo_list), "Something wrong within the corpus, please delete 'bin' folder and re-run it."
        if len(todo_list) != len(done_list):
            build_elasticsearch(data_path=DATA_PATH, zipfile_name=FOLDER_NAME)