def __init__(self, languge):
    """Prepare the extractor for a single language.

    Loads the project ``Settings``, remembers the language, builds a
    ``TextProcessor`` for it and keeps the ``signal_words_ru`` value
    taken from the settings.

    Args:
        languge: Language identifier handed to ``TextProcessor``. The
            parameter name is kept exactly as spelled for existing callers.
    """
    settings = Settings()
    self.settings = settings
    self.language = languge
    self.text_processor = TextProcessor(languge)
    self.signalwords = settings.signal_words_ru
def process(self, text, doc_id):
    """Process one document with ``TextProcessor`` and return the index.

    A fresh ``TextProcessor`` is created from the text, the current
    ``self.index`` and the document id; its ``process_text()`` is run and
    the result of ``get_index()`` is handed back.

    Args:
        text: Document text to process.
        doc_id: Identifier of the document.

    Returns:
        The value returned by ``TextProcessor.get_index()``.
    """
    worker = TextProcessor(text, self.index, doc_id)
    worker.process_text()
    updated_index = worker.get_index()
    return updated_index
class InvertedIndex:
    """Query/document index built from a CSV corpus and cached as a JSON dump.

    Building reads the corpus, runs column 1 (query) and column 2 (document)
    through ``TextProcessor.process``, keeps the raw document and the value of
    column 3, and fits a ``CountVectorizer`` on the processed queries.
    The finished object is serialised with ``jp_encode`` and written to disk.
    """

    # Value of the first CSV column at which indexing stops; the index is
    # deliberately partial for now (see the TODO in build_index).
    STOP_ROW_ID = '10000'

    def __init__(self, fpath, dump_fpath):
        """Build the index from ``fpath`` and immediately dump it to ``dump_fpath``.

        Args:
            fpath: Path of the CSV corpus.
            dump_fpath: Path the serialised index is written to.
        """
        self.text_processor = TextProcessor()
        self.queries = []
        self.documents = []
        self.original_documents = []
        self.is_duplicates = []
        self.vectorizer = CountVectorizer()
        self.build_index(fpath)
        self.dump(dump_fpath)

    def build_index(self, fpath):
        """Fill the index lists from the CSV file and fit the vectorizer.

        Rows whose first column is empty (and completely blank rows) are
        skipped; reading stops at the row whose first column equals
        ``STOP_ROW_ID``.

        Args:
            fpath: Path of the CSV corpus (made absolute before opening).
        """
        with open(abspath(fpath), 'r', encoding='utf-8') as file:
            table = csv_reader(file)
            # The rows are materialised before being handed to tqdm,
            # presumably so the progress bar knows the total.
            for row in tqdm(list(table)):
                # A blank line yields an empty list; indexing it would raise.
                if not row or row[0] == '':
                    continue
                # TODO: build the full index
                if row[0] == self.STOP_ROW_ID:
                    break
                self.queries.append(self.text_processor.process(row[1]))
                self.documents.append(self.text_processor.process(row[2]))
                self.original_documents.append(row[2])
                self.is_duplicates.append(row[3])
        # Only the fitted state is needed; the transformed matrix was discarded.
        self.vectorizer.fit(self.queries)

    def dump(self, dump_fpath):
        """Serialise this object with ``jp_encode`` and write it as JSON.

        Args:
            dump_fpath: Destination file path.
        """
        json_encoded = jp_encode(self)
        with open(dump_fpath, 'w', encoding='utf-8') as file:
            json_dump(json_encoded, file, ensure_ascii=False, indent=4)

    @staticmethod
    def restore(dump_fpath):
        """Load a dump written by :meth:`dump` and decode it with ``jp_decode``.

        Args:
            dump_fpath: Path of the dump file.

        Returns:
            The object produced by ``jp_decode``.
        """
        with open(dump_fpath, "r", encoding='utf-8') as file:
            idx_dump = json_load(file)
        return jp_decode(idx_dump)

    @staticmethod
    def from_dump_or_build(dump_fpath, corpora_fpath):
        """Return the index from the dump if possible, otherwise rebuild it.

        Args:
            dump_fpath: Path of the dump file (read if present, written on rebuild).
            corpora_fpath: Path of the CSV corpus used for a rebuild.

        Returns:
            A restored or freshly built index.
        """
        if isfile(dump_fpath):
            try:
                return InvertedIndex.restore(dump_fpath)
            except Exception:
                # Falling back to a rebuild is deliberate, but the reason the
                # dump was unusable should not vanish silently.
                import logging
                logging.getLogger(__name__).warning(
                    "Could not restore index from %s; rebuilding",
                    dump_fpath, exc_info=True)
        return InvertedIndex(corpora_fpath, dump_fpath)
def __init__(self, fpath, dump_fpath):
    """Create the empty containers, then build and dump the index.

    Args:
        fpath: Path passed to ``self.build_index``.
        dump_fpath: Path passed to ``self.dump``.
    """
    self.text_processor = TextProcessor()
    # Attribute order is kept as-is because the instance is serialised later.
    self.queries, self.documents = [], []
    self.original_documents, self.is_duplicates = [], []
    self.vectorizer = CountVectorizer()

    self.build_index(fpath)
    self.dump(dump_fpath)
class PtBrTwitter():
    """Read tweets from JSON-lines files and persist token counts.

    Every ``*.json`` file found under ``dir_in`` is expected to hold one JSON
    object per line with at least the keys ``text`` and ``created_at``.
    """

    def __init__(self, dir_in, dir_out):
        """Remember the directories and list the tweet files.

        Args:
            dir_in: Directory that is walked for ``*.json`` files.
            dir_out: Directory used for the pickled counters.
        """
        self.dir_in = dir_in
        self.dir_out = dir_out
        # Keep each path relative to dir_in so that files found in
        # sub-directories can be reopened; for top-level files this is still
        # just the bare file name.
        self.tw_files = [
            os.path.relpath(os.path.join(root, name), self.dir_in)
            for root, _dirs, names in os.walk(self.dir_in)
            for name in names
            if name.endswith('.json')
        ]
        self.doc_list = list()
        self.date_list = list()
        self.tp = TextProcessor()

    def read(self):
        """Append the text and creation date of every tweet to the lists."""
        for tw_file in self.tw_files:
            # os.path.join copes with dir_in lacking a trailing separator.
            with open(os.path.join(self.dir_in, tw_file)) as data_file:
                for line in data_file:
                    if not line.strip():
                        continue  # tolerate blank lines between records
                    tweet = json.loads(line)
                    self.doc_list.append(tweet['text'])
                    self.date_list.append(tweet['created_at'])

    def tokenizeAndSave(self, file_name):
        """Tokenise the collected texts and pickle the token frequencies.

        Args:
            file_name: Name of the pickle file created inside ``dir_out``.
        """
        tweets = self.tp.text_process(self.doc_list)
        t_count = Counter(itertools.chain.from_iterable(tweets))
        with open(os.path.join(self.dir_out, file_name), 'wb') as handle:
            pickle.dump(t_count, handle)

    def loadCounter(self, file_name):
        """Load a counter previously written by :meth:`tokenizeAndSave`.

        Args:
            file_name: Name of the pickle file inside ``dir_out``.

        Returns:
            The unpickled object.
        """
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only use this on files this program wrote itself.
        with open(os.path.join(self.dir_out, file_name), 'rb') as handle:
            t_count = pickle.load(handle)
        return t_count
def GenerateLocalizableFile(self):
    """Write the localizable form of the English article.

    The English original is read, passed to ``TextProcessor`` through its
    ``django`` keyword, and the resulting ``html`` attribute is written to
    ``self.localizable_file_path``.

    Returns:
        ``self.localizable_file_path``.

    Raises:
        ArticleException: If ``'en'`` is not among ``self.locales``.
    """
    if 'en' not in self.locales:
        error = """ ArticleException: - path: %s - locales: %s - No English edition found. """
        raise ArticleException(error % (self.path, self.locales))

    english_path = self.GetOriginalFilePath('en')
    # Both files are opened before anything is read, exactly as before.
    with codecs.open(english_path, 'r', 'UTF-8') as infile, \
            codecs.open(self.localizable_file_path, 'w', 'UTF-8') as output:
        converted = TextProcessor(django=infile.read())
        output.write(converted.html)

    return self.localizable_file_path
def run(self):
    """Normalise the ``name`` column of every input CSV and write it out.

    For each key of ``self.input()`` the CSV at that input's path is read,
    every ``name`` value is passed through ``TextProcessor.process_text``
    with ``lang='ru'``, the resulting pieces are joined with single
    spaces, and the frame is written to the path of the matching output.
    """
    logger.info('Creating text processor')
    text_processor = TextProcessor()

    def to_tokens(value):
        return text_processor.process_text(value, lang='ru')

    for key in self.input().keys():
        logger.info('Reading %s file: "%s"', key, self.input()[key].path)
        frame = pd.read_csv(self.input()[key].path)
        logger.info('Its %s lines', frame.shape[0])

        logger.info('Start processing %s...', key)
        frame.name = frame.name.map(to_tokens)
        frame.name = frame.name.map(' '.join)

        logger.info('Processing of %s succeed, writing it to "%s"',
                    key, self.output()[key].path)
        frame.to_csv(self.output()[key].path)
def __init__(self, dir_in, dir_out):
    """Remember the directories and list the ``*.json`` files under ``dir_in``.

    Args:
        dir_in: Directory that is walked for ``*.json`` files.
        dir_out: Output directory, stored for later use.
    """
    self.dir_in = dir_in
    self.dir_out = dir_out
    # os.walk also descends into sub-directories; keeping only the bare file
    # name lost the sub-directory. Paths are stored relative to dir_in, which
    # for top-level files is still just the file name.
    self.tw_files = [
        os.path.relpath(os.path.join(root, name), self.dir_in)
        for root, _dirs, names in os.walk(self.dir_in)
        for name in names
        if name.endswith('.json')
    ]
    self.doc_list = list()
    self.date_list = list()
    self.tp = TextProcessor()
def __init__(self, text = ''):
    """Pre-process ``text`` and load the positive/negative word dictionaries.

    Args:
        text: Raw text; it is passed to ``TextProcessor`` and the result of
            ``getProcessedText()`` is stored in ``self.text``.
    """
    # prepare to use nltk_data that sits next to this file
    path_to_nltk_data = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'nltk_data')
    # Register the directory only once; previously every new instance
    # appended the same path again.
    if path_to_nltk_data not in nltk.data.path:
        nltk.data.path.append(path_to_nltk_data)

    # pre-processing
    tp = TextProcessor(text)
    self.text = tp.getProcessedText()

    # setup dictionaries
    corpus = Corpus()
    words = {}
    words['positive'] = corpus.positiveWordDict()
    words['negative'] = corpus.negativeWordDict()
    self.dictionaries = words
def get_article(self, url):
    """Download a page, pick the element ranked best, and save it as text.

    The page is parsed into elements, the tag-free texts are run through
    ``TextProcessor.iterate_over_texts``, ``ArticleExtractor.find_best_node``
    ranks them, and the tagged element belonging to the top-ranked text is
    formatted and written to ``output.txt`` together with the page title.

    Args:
        url: Address of the page to load.

    Raises:
        ValueError: If nothing was ranked, or the top-ranked text does not
            correspond to any parsed element.
    """
    # fetch the web page and its encoding
    urlhandler = UrlHandler()
    source_page, encoding = urlhandler.load_page(url)

    # set up the parser, text processor, extractor and formatter
    html_parser = Parser(source_page, encoding)
    text_processor = TextProcessor(self.language)
    article_extractor = ArticleExtractor(self.language)
    formatter = Formatter()

    # element lists: with tags stripped (raw_cleaned_elements) and with
    # tags kept (elements_as_string)
    raw_cleaned_elements, elements_as_string = html_parser.get_parsed_nodes()

    # title
    title = html_parser.get_title()

    # processed texts, then the ranked list of elements
    stemmed_tag_elements = text_processor.iterate_over_texts(
        raw_cleaned_elements)
    best_nodes = article_extractor.find_best_node(stemmed_tag_elements)
    if not best_nodes:
        raise ValueError('No candidate elements were ranked for %s' % url)

    # Find the tagged element that belongs to the top-ranked text. As
    # before, the last matching element wins when the text occurs twice.
    best_text = best_nodes[0][0]
    found = False
    node_to_format = None
    for text, element in zip(raw_cleaned_elements, elements_as_string):
        if best_text == text:
            node_to_format = element
            found = True
    if not found:
        # Previously this surfaced as an unbound-variable error below.
        raise ValueError(
            'Top-ranked text does not match any parsed element for %s' % url)

    # the formatter prepares the text for saving
    clean_text = formatter.format_article(node_to_format)

    # save to a text file
    with codecs.open('output.txt', 'w', 'utf-8') as out:
        out.write(title + '\n\n')
        for paragraph in clean_text:
            for line in paragraph:
                out.write(line)
                out.write('\n')
            out.write('\n')
def process_files(self):
    """Extract and process the text of every supported file in ``self.path``.

    Files are dispatched on the part of the name after the last dot:
    ``txt``, ``pdf`` and ``html`` go to ``process_txt``, ``process_pdf``
    and ``process_html``; anything else is ignored. Each extracted text is
    then sent to ``TextProcessor.process`` as a JSON request and the
    ``terms`` entry of the JSON reply replaces the text.

    Returns:
        dict mapping file name to the ``terms`` value for that file.
    """
    # Handlers are looked up by name so a handler is only resolved for
    # files that actually need it.
    handler_names = {
        'txt': 'process_txt',
        'pdf': 'process_pdf',
        'html': 'process_html',
    }

    files_dic = {}
    for file_name in listdir(self.path):
        handler_name = handler_names.get(file_name.split('.')[-1])
        if handler_name is None:
            continue
        files_dic[file_name] = getattr(self, handler_name)(self.path + file_name)

    tp = TextProcessor()
    for file_name in list(files_dic):
        # call the text_processor module
        request = JSONEncoder().encode(
            {'action': 'process', 'data': files_dic[file_name]})
        reply = tp.process(request)
        files_dic[file_name] = JSONDecoder().decode(reply)['terms']
    return files_dic
def __getitem__(self, idx):
    """Build one training sample for the video at position ``idx``.

    Args:
        idx: Position in ``self.train_dir_list``.

    Returns:
        ``((X_1, torch.tensor(X_2)), torch.tensor(y))`` where ``X_1`` comes
        from ``Utils.video_to_frames`` and ``X_2`` / ``y`` are the encoded
        ``tagger_input`` / ``tagger_output`` forms of the video's text.
    """
    textprocessor = TextProcessor(VOCAB_SIZE=self.VOCAB_SIZE)
    utils = Utils()

    # video file for this index and the text that belongs to it
    # NOTE(review): the text is fetched through self.utils while everything
    # else uses the local Utils() instance; confirm this is intended.
    video_file = self.train_dir_list[idx]
    output_text = self.utils.output_text(self.train_corpus, video_file)

    def encode(tagger):
        # clean -> tag -> word indices -> fixed-size output
        tagged = tagger(utils.clean_text(output_text))
        indices = textprocessor.sentence_to_indices(tagged, self.word_to_index)
        return textprocessor.get_output(indices, self.NUMBER_OF_WORDS)

    # input 2 and the target, both derived from the text
    X_2 = encode(utils.tagger_input)
    y = encode(utils.tagger_output)

    # input 1: frames of the video
    X_1 = utils.video_to_frames(
        self.train_dir + video_file,
        self.number_of_frames,
        self.device,
        self.INPUT_SIZE,
        self.model,
        self.transform,
    )
    return (X_1, torch.tensor(X_2)), torch.tensor(y)
def main():
    """Load a serialised classifier and print the predicted tag per example.

    Reads the classifier from ``args.classifier``, then classifies the
    ``content`` of every example returned by ``read_dataset(args.data)``.
    """
    args = parser.parse_args()

    with open(args.classifier, 'r') as f:
        serialized_classifier = f.read()

    classifier = Classifier.load(serialized_classifier, TextProcessor())

    for example in read_dataset(args.data):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(classifier.classify(example['content']))
def main():
    """Train a classifier on the whole dataset and write it to ``args.output``.

    The serialised classifier is followed by ``os.linesep`` in the file.
    """
    args = parser.parse_args()

    training_data = read_dataset(args.data)
    classifier = Classifier(TextProcessor())
    classifier.train(training_data)
    payload = classifier.dump()

    ensure_directory(args.output)
    with open(args.output, 'w') as out:
        for chunk in (payload, os.linesep):
            out.write(chunk)
def main():
    """Check that a trained classifier survives a dump/load round trip.

    Trains on a shuffled 70% of the dataset, serialises the classifier,
    loads it back and prints whether re-serialising the loaded classifier
    gives the same result.
    """
    args = parser.parse_args()
    data_json = read_dataset(args.data)
    random.shuffle(data_json)

    training_set_ratio = 0.7
    # + 0.5 rounds to the nearest integer instead of truncating.
    training_set_size = int(training_set_ratio * len(data_json) + 0.5)
    training_set = data_json[:training_set_size]
    # The remaining examples were sliced into a test set that was never used.

    processor = TextProcessor()
    classifier = Classifier(processor)
    classifier.train(training_set)

    # Dump once and compare that exact text with the round-tripped dump.
    serialized = classifier.dump()
    print(serialized == Classifier.load(serialized, processor).dump())
def response():
    """Render the report generator for a valid repository link.

    Reads ``link`` from the submitted form. An invalid link re-renders
    ``main.html`` with an error text. Otherwise the history is fetched and
    the first field of its first and last entries become the default
    start/end values (empty strings when there is no history).
    """
    link = request.form['link']

    if not TextProcessor.validate_link(link):
        return render_template("main.html", text="Invalid Git Repository Link")

    git_history = GitProcessor.get_history(repo_link=link,
                                           num_of_commits=INITIAL_COMMITS)
    if len(git_history) > 0:
        start_default_value = git_history[0][0]
        end_default_value = git_history[-1][0]
    else:
        start_default_value = end_default_value = ""

    return render_template(template_name_or_list="report_generator.html",
                           history=git_history,
                           repo_link=link,
                           start_default_value=start_default_value,
                           end_default_value=end_default_value)
def ImportLocalizedFiles(self):
    """Import every new localized file that exists on disk.

    For each locale in ``self.new_localizations`` whose localized file is
    present, the file is read, passed to ``TextProcessor`` through its
    ``html`` keyword, and the ``django`` attribute of the result is
    written to that locale's original-file path. Afterwards the two
    private locale lists are reset to empty lists.
    """
    for locale in self.new_localizations:
        if not os.path.isfile(self.GetLocalizedFilePath(locale)):
            continue

        # Best effort: the target directory may already exist.
        try:
            os.mkdir(os.path.dirname(self.GetOriginalFilePath(locale)))
        except OSError:
            pass

        in_path = self.GetLocalizedFilePath(locale)
        out_path = self.GetOriginalFilePath(locale)
        with codecs.open(in_path, 'r', 'UTF-8') as infile, \
                codecs.open(out_path, 'w', 'UTF-8') as outfile:
            converted = TextProcessor(html=infile.read())
            outfile.write(converted.django)

    self.__available_localizations = []
    self.__locales = []
class TestIsWord:
    """Checks for ``TextProcessor.is_word`` on single and multi-token input."""

    # One processor shared by all test methods.
    tp = TextProcessor()

    def _is_word(self, text):
        """Return what the shared processor's ``is_word`` says about ``text``."""
        return self.tp.is_word(text)

    def test_one_word_latin(self):
        assert self._is_word('Test')

    def test_one_word_cyr(self):
        assert self._is_word('Тест')

    def test_two_words(self):
        assert not self._is_word('Test test')

    def test_word_with_num(self):
        assert self._is_word('Test1')

    def test_word_with_underscore(self):
        assert self._is_word('Test_test')

    def test_words_devided_by_symbol(self):
        assert not self._is_word('Test*test')
def main():
    """Estimate classifier accuracy over repeated random 70/30 splits.

    For each dataset size (currently only the full size) the data is
    shuffled ``SAMPLES`` times; each time a classifier is trained on the
    first 70% and evaluated on the rest. Examples whose normalised tag is
    in ``Classifier.IGNORE_TAGS`` are not counted. One line with the
    dataset size and the fraction of correct predictions is printed.
    """
    args = parser.parse_args()
    full_data_json = read_dataset(args.data)

    # for n in xrange(30, len(full_data_json), 30):
    for n in [len(full_data_json)]:
        corrects = 0
        total = 0

        for _ in xrange(SAMPLES):
            random.shuffle(full_data_json)
            data_json = full_data_json[:n]

            training_set_ratio = 0.7
            split_at = int(training_set_ratio * len(data_json) + 0.5)
            training_set, test_set = data_json[:split_at], data_json[split_at:]

            classifier = Classifier(TextProcessor())
            classifier.train(training_set)

            for example in test_set:
                predicted_tag = classifier.classify(example['content'])
                expected_tag = classifier.normalize_tag_label(example['tag'])
                if expected_tag in Classifier.IGNORE_TAGS:
                    continue
                total += 1
                if predicted_tag == expected_tag:
                    corrects += 1

        print('{} {}'.format(len(data_json), float(corrects) / total))
# Resolve the word2vec directory from the [file_path] section of the config.
path = dict(cf.items("file_path"))
dir_w2v = path['dir_w2v']

# Load the embeddings in text format, ignoring undecodable bytes.
print('Loading word2vec model...')
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    dir_w2v + W2VEC_MODEL_FILE, binary=False, unicode_errors="ignore")

# Validation texts and their true labels.
texts, y_true = load_validation_file_csv(VALIDATION_FILE)

# Load the previously saved model.
print('Loading ' + MODEL_FILE + ' file...')
model = joblib.load(MODEL_FILE)

pol = ''
n_pol = ''
y_pred = list()  # placeholder; replaced by model.predict below

# Pre-process the texts and turn them into the model's input.
tp = TextProcessor()
texts = tp.text_process(texts, text_only=True)
X = gen_data(texts)

# ROC curve; the helper returns the mean and standard deviation of the AUC.
mean_auc, std_auc = generate_roc_curve(
    model, X, y_true, MODEL_FILE, get_model_name_by_file(VALIDATION_FILE))

print('Predicting...')
y_pred = model.predict(X)

print('Classification Report')
print(classification_report(y_true, y_pred))
p, r, f1, s = precision_recall_fscore_support(y_true, y_pred)

# Model name is the file name with the SKL_FOLDER part removed.
model_name = MODEL_FILE.replace(SKL_FOLDER, '')
# Source file with the tweets to convert.
file = "/Users/lucasso 1/Documents/validation/nao_politicos.json"

# Column-wise accumulator: one list per tweet field.
data = {
    'favorites': [],
    'user_id': [],
    'text': [],
    'retweets': [],
    'created_at': [],
    'tweet_id': [],
    'user_screen_name': []
}
for t in load_tweets(file):
    data['user_id'].append(t['user_id'])
    data['favorites'].append(t['favorites'])
    data['text'].append(t['text'])
    data['retweets'].append(t['retweets'])
    data['created_at'].append(t['created_at'])
    data['tweet_id'].append(t['tweet_id'])
    data['user_screen_name'].append(t['user_screen_name'])

df = pd.DataFrame(data)
# created_at is parsed as a millisecond timestamp and becomes the sorted index.
df['created_at'] = pd.to_datetime(df['created_at'], unit='ms')
df = df.set_index('created_at')
df = df.sort_index(ascending=True)

# Add the processed text and mark every row with political = 0.
tp = TextProcessor()
df['text_processed'] = tp.text_process(df.text.tolist(), hashtags=True)
df['political'] = 0

# Write next to the input; note that replace() swaps every 'json' in the path.
file = file.replace('json', 'pck')
df.to_pickle(file)
""" sort_tfidf = sorted(dic_tfidf, key=lambda x: x[1], reverse=True) sort_tf_log_idf = sorted(dic_tf_log_idf, key=lambda x: x[1], reverse=True) sort_tfidf_like = sorted(dic_tfidf_like, key=lambda x: x[1], reverse=True) plot_tfidfs(sort_tfidf, sort_tf_log_idf, sort_tfidf_like) n = 2000 plot_cloud(sort_tfidf,n,"dic_tfidf") plot_cloud(sort_tf_log_idf,n,"dic_tf_log_idf") plot_cloud(sort_tfidf_like,n,"dic_tfidf_like") dir_path = "/Users/lucasso/Documents/tweets/" tp = TextProcessor() tw_files = ([file for root, dirs, files in os.walk(dir_path) for file in files if file.endswith('.json') ]) tw_list = list() tweets = list() for tw_file in tw_files: with open(dir_path+tw_file) as data_file: doc_list = list() for line in data_file: tweet = json.loads(line) doc_list.append(tweet['text']) tw_list.append(list(itertools.chain.from_iterable(tp.text_process(doc_list)))) for i in range(len(tw_list)): plot_dep_cloud(tw_list[i],sort_tfidf,n,tw_files[i]+"_dic_tfidf")
def get_text(self, row):
    """Return the text for the row's ``GUID``.

    Args:
        row: Mapping with a ``'GUID'`` entry.

    Returns:
        The result of ``TextProcessor(guid, self.file_dictionary).get_text()``.
    """
    return TextProcessor(row['GUID'], self.file_dictionary).get_text()
from text_processor import TextProcessor
import configparser
import pickle
import numpy as np
import csv
import random

if __name__ == '__main__':
    # Directory settings come from the [file_path] section.
    cf = configparser.ConfigParser()
    cf.read("file_path.properties")
    path = dict(cf.items("file_path"))
    dir_btm = path['dir_btm']
    dir_in = path['dir_in']
    dir_out = path['dir_out']
    dir_down = path['dir_down']

    tp = TextProcessor()

    # NOTE(review): this handle is not closed anywhere in the visible code.
    f = open(dir_down + "sanders-twitter-0.2/full-corpus.csv", "rt")
    twitter = csv.reader(f, delimiter=',')

    # Read every row, then shuffle the rows in place.
    tweets = list()
    for tw in twitter:
        tweets.append(tw)
    random.shuffle(tweets)

    # Column 0 is collected as the topic and column 4 as the text.
    topic = list()
    txt = list()
    for tw in tweets:
        topic.append(tw[0])
        txt.append(tw[4])
def days2time(days):
    """Return the millisecond timestamp ``days`` days after the base date.

    The base value and the per-day step are the constants explained in the
    comment below.
    """
    #1380844800000 = 04/10/2013, 86400000 = 1 day
    return 1380844800000+(days*86400000)


if __name__=='__main__':
    dir_out = "/Users/lucasso/Dropbox/Twitter_Marcelo/Report/plot/grafos/"
    name = "74173_DeputadoEduardoCunha"
    # Tweet file and lambda file are both derived from the same name.
    filedir = "/Users/lucasso/Dropbox/Twitter_Marcelo/Report/coleta_pedro/"+name+".json"
    lamb_dir = "/Users/lucasso/Dropbox/Twitter_Marcelo/Report/coleta_pedro/lambdas/lambdats/"+name+"_wsize7.dat"
    tp = TextProcessor()
    with open(filedir) as data_file:
        doc_set = list()
        doc_tw = set()
        dc =set()
        weeks = list()
        dist = list()
        lamb = list()
        # Day offsets (see days2time) bounding the window of interest.
        inicial = 1
        final = 603
        for line in data_file:
            tweet = json.loads(line)
            created = int(tweet['created_at'])
            # Keep the text of tweets created inside [inicial, final).
            if(days2time(inicial) <= created < days2time(final)):
                doc_tw.add(tweet['text'])
#model = gensim.models.LdaModel.load('android.lda') print(ldamodel.print_topics()) #ldamodel.print_topics() return ldamodel if __name__=='__main__': cf = configparser.ConfigParser() cf.read("file_path.properties") path = dict(cf.items("file_path")) dir_in = path['dir_in'] dir_out = path['dir_out'] dir_ale = path['dir_ale'] tp = TextProcessor() with open(dir_out+"list_parl_tw_bi_trigrams2.pck",'rb') as handle: parl_tweets = pickle.load(handle) with open(dir_out+"tfidf_like_bi_trigrams.pck",'rb') as handle: tfidf_like_bi_trigrams = pickle.load(handle) dic_words = dict(sort_tfidf_like[1:15000]) list_tw_parl = list() for parl in parl_tweets: temp = list() for tw in parl: temp.append(list([x for x in tw if x in dic_words])) list_tw_parl.append(temp)
def main(argv=None):
    """Train the sequence-to-sequence model and save the best weights.

    Reads the hyper-parameters from ``FLAGS``, builds the vocabulary and the
    dataset, assembles the encoder/decoder model, optionally loads existing
    weights, and runs the training loop. After every epoch whose summed loss
    is lower than the best so far, the model's state dict is saved.

    Args:
        argv: Not used inside the function.
    """
    ### parameters
    LEARNING_RATE = FLAGS.LEARNING_RATE
    NUMBER_OF_FRAMES = FLAGS.NUMBER_OF_FRAMES
    BATCH_SIZE = FLAGS.BATCH_SIZE
    EPOCH = FLAGS.EPOCH
    # NOTE(review): TRAINING_DEVICE is read but the code below uses the
    # name `device`, which is not defined in this function.
    TRAINING_DEVICE = FLAGS.TRAINING_DEVICE
    VOCAB_SIZE = FLAGS.VOCAB_SIZE
    NUMBER_OF_WORDS = FLAGS.NUMBER_OF_WORDS
    HIDDEN_SIZE = FLAGS.HIDDEN_SIZE
    INPUT_SIZE = FLAGS.INPUT_SIZE
    NUMBER_OF_LAYERS = FLAGS.NUMBER_OF_LAYERS
    # Frame preprocessing: resize, convert to tensor, normalise per channel.
    tsfm = transforms.Compose([
        transforms.Resize([224, 224]),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    train_dir = FLAGS.train_dir  #'D:/Machine_Learning/datasets/YouTubeClips_2/YouTubeClips/'
    train_corpus = FLAGS.train_corpus  #'D:/Machine_Learning/datasets/video_corpus/video_corpus.csv'
    print("train_dir is =", train_dir)
    print("train_corpus =", train_corpus)
    # Build the vocabulary from all text of the corpus.
    utils = Utils()
    all_text = utils.output_text(train_corpus)
    text_processor = TextProcessor(freq_threshold=10)
    dictionary = text_processor.vocab_creator(all_text)
    ### training data preparation
    train_ds = CustomDataset(train_dir,
                             train_corpus,
                             device,
                             dictionary,
                             VOCAB_SIZE,
                             NUMBER_OF_WORDS,
                             INPUT_SIZE,
                             NUMBER_OF_FRAMES,
                             tsfm,
                             model=md.model_vgg)
    train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE)
    ### Model definition
    encoder = Encoder_LSTM(input_size=INPUT_SIZE,
                           hidden_size=HIDDEN_SIZE,
                           num_layers=NUMBER_OF_LAYERS)
    decoder = Decoder_LSTM(input_size=VOCAB_SIZE,
                           hidden_size=HIDDEN_SIZE,
                           num_layers=NUMBER_OF_LAYERS,
                           number_of_words=NUMBER_OF_WORDS)
    model_seq_to_seq = Seq2Seq(encoder, decoder).to(device)
    model = model_seq_to_seq
    ### load the state_dict of model if model has been pretrained.
    if (FLAGS.load_weights):
        print("there are weights to be loaded")
        model.load_state_dict(torch.load(FLAGS.load_weights))
    ### optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss()
    #### Model Training
    import time
    print_feq = 1  # print statistics every `print_feq` steps
    best_loss = np.inf
    for epoch in range(1, EPOCH + 1):
        model.train()
        epoch_loss = 0
        for step, (img, label) in enumerate(train_dl):
            time_1 = time.time()  ## timing
            X_1, X_2 = img  ### get inputs
            X_1 = X_1.to(device)  # Set device
            X_2 = X_2.to(device)  # Set device
            label = label.to(device)  # Set output device
            ### zero the parameter gradients
            optimizer.zero_grad()
            ### forward
            prediction = model(X_1, X_2)
            ### Optimize
            prediction = prediction.to(device)
            # Drop dimension 0 when it has size 1.
            # NOTE(review): this presumably assumes a batch size of 1 - confirm.
            prediction = torch.squeeze(prediction, 0)
            label = torch.squeeze(label, 0)
            # Convert each label row into the index of its largest entry,
            # the class-index form passed to the loss below.
            new_label = torch.zeros([label.shape[0]])
            for l in range(label.shape[0]):
                new_label[l] = np.argmax(label[l].cpu())
            new_label = new_label.to(device)
            loss = criterion(prediction, new_label.long())
            # Backward prop.
            loss.backward()
            optimizer.step()
            ### print out statistics
            epoch_loss += loss.item()
            if step % print_feq == 0:
                print('epoch:', epoch, '\tstep:', step + 1, '/',
                      len(train_dl) + 1, '\ttrain loss:',
                      '{:.4f}'.format(loss.item()), '\ttime:',
                      '{:.4f}'.format(
                          (time.time() - time_1) * print_feq), 's')
        ### save best model
        if (epoch_loss < best_loss):
            best_loss = epoch_loss
            # The file name encodes the main hyper-parameters.
            model_name = 'MODEL_SEQ2SEQ' + 'VOCAB_SIZE_' + str(
                VOCAB_SIZE) + 'NUMBER_OF_WORDS_' + str(
                    NUMBER_OF_WORDS
                ) + 'HIDDEN_SIZE_' + str(HIDDEN_SIZE) + 'INPUT_SIZE_' + str(
                    INPUT_SIZE) + 'NUMBER_OF_LAYERS_' + str(NUMBER_OF_LAYERS)
            torch.save(model.state_dict(), model_name + '.pth')
        # Summed step losses divided by the number of batches.
        print("The loss for this epoch is = :", epoch_loss / len(train_dl))
# Command-line options; the two model files default to the module constants.
parser.add_argument('-h5', default=H5_FILE)
parser.add_argument('-npy', default=NPY_FILE)
parser.add_argument('-vf', '--validationfile', required=True)
args = parser.parse_args()
H5_FILE = args.h5
NPY_FILE = args.npy
VALIDATION_FILE = args.validationfile

cf = configparser.ConfigParser()
cf.read("file_path.properties")
path = dict(cf.items("file_path"))
dir_in = path['dir_in']

# Validation texts and their true labels.
X, y_true = load_validation_file_csv(VALIDATION_FILE)
tp = TextProcessor()
pc = PoliticalClassification(H5_FILE, NPY_FILE, 25)

pol = ''      # texts classified as political, one per line
n_pol = ''    # texts classified as non-political, one per line
y_pred = list()
X = tp.text_process(X, text_only=True)
for tx in X:
    # Each processed text is a sequence of pieces; rebuild a plain string.
    text = ' '.join(tx)
    if pc.is_political(text):
        pol += text + '\n'
        y_pred.append(1)
    else:
        n_pol += text + '\n'
        y_pred.append(0)
if __name__=='__main__':
    # Directory settings come from the [file_path] section.
    cf = configparser.ConfigParser()
    cf.read("../file_path.properties")
    path = dict(cf.items("file_path"))
    dir_out = path['dir_out']
    dir_ale = path['dir_ale']
    dir_tw = path['dir_tw']
    print("load tweet files")
    # Names (without directory) of all .json files found below dir_tw.
    fnames = ([file for root, dirs, files in os.walk(dir_tw)
               for file in files if file.endswith('.json') ])
    # One list of processed tweets per file.
    # NOTE(review): read_tweets receives the bare file name; verify that it
    # resolves the directory itself.
    categories_tw = list()
    tp = TextProcessor()
    for fl in fnames:
        categories_tw.append(tp.text_process(read_tweets(fl)))
    categories_counter = list()
    test_data = list()
    for categ in categories_tw:
        # After shuffling, the first 20% of each category is held out as
        # test data; the rest is flattened and counted.
        k = int(len(categ) * 0.2)
        random.shuffle(categ)
        tmp = list(itertools.chain.from_iterable(categ[k:]))
        categories_counter.append(Counter(tmp))
        test_data.append(categ[:k])
    print("process tfidf")
    tfidf_entropy = list()
for tw_file in tw_files: with open(dir_in+tw_file) as data_file: for line in data_file: tweet = json.loads(line) doc_list.append(tweet['text']) return doc_list if __name__=='__main__': dir_in= "/Users/lucasso/Documents/pck/" dir_ent = "/Users/lucasso/Documents/tweets_pedro/" dir_out= "/Users/lucasso/Dropbox/Twitter_Marcelo/Report/plot/" doc_list = load_files(dir_ent) tp = TextProcessor() tweets = tp.text_process(doc_list) word_list = set(load_file(dir_out,"word_list.pck")) #lista já processada sem entropia 0 e ration >1. remove todas as outras palavras que não interessam dos tweets tweets =[[i for i in t if i in word_list] for t in tweets] hashtags = re.compile(r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""") hs_set =set() hastgs_list = list() for tweet in tweets: v = ','.join(hashtags.findall( ' '.join(tweet))) l = hashtags.findall( ' '.join(tweet)) hastgs_list.append(l) hs_set |= set(v.split(",")) hastgs_list = [e for e in hastgs_list if e] # remove as listas em branco
import os
import sys

# Make the project's `source` directory and the project root importable.
currentdir = os.path.dirname(os.path.realpath(__file__))
parentdir = os.path.dirname(currentdir)
sys.path.append(parentdir + '/source')
sys.path.append(parentdir)

import csv
from text_processor import TextProcessor

text_pro = TextProcessor()

# Read the CSV line by line and write the cleaned lines to a second file.
# Both files are opened in a `with` block so they are closed (and the
# output flushed) even if processing a row raises.
with open('tweet_data/ShardiB2_2021-02-03 05_29_05.236426_tweets.csv',
          "r") as csv_file, \
     open("tweet_data/no_links or tickers_ShardiB2_2021-02-03 05_29_05.236426_tweets.csv",
          "w") as x:
    # loop through the csv lines
    for row in csv_file:
        replaced_row = text_pro.replace_tickers_with_company_names(row)
        # Cut the line at the first "https" to drop links; remove this
        # block to keep them.
        index = replaced_row.find("https")
        if index != -1:
            replaced_row = replaced_row[:index] + ", \n"
        # Each row is a single string, so write() is the matching call.
        x.write(replaced_row)
def __worker(self, pipe, l_log):
    """The core of the STT program, this is the multiprocessed part

    Note:
        Multiprocessing will require a pipe between the parent and child subprocess.
        Since this is the case, the worker subprocess cannot access non-shared variables

    Arguments:
        pipe: A pair that is unpacked as ``(p_out, p_in)``; commands are
            read from ``p_out`` and replies are sent back through it
        l_log: The logger used for all messages of the worker
    """

    l_log.debug("STT worker started")

    audio_processor = AudioProcessor()  # Create a new audio processing object
    # Remember that we can't load the text processor nltk model until the
    # nltk model is set from the client language
    text_processor = TextProcessor()
    # Create a new pocketsphinx decoder configuration with the defaults,
    # which is English
    config = Decoder.default_config()
    decoder = None
    nltk_model = None
    mutex_flags = {"keyphrases": {"use": False}}
    shutdown_flags = {"shutdown": False, "decoder": None}

    def send_json(pipe, to_send):
        """Internal worker method to send a json through the parent socket

        Arguments:
            pipe (:obj: socket): The response pipe to send to the parent process
            to_send (:obj: dict): A dictionary to be sent to the parent socket
        """
        try:
            # Send the message passed by argument back to the parent process
            ret = self.__send_buffered(pipe, to_send)
            if not ret[0]:
                l_log.error(
                    "Failed to send buffered message to the parent process! (err: %s)"
                    % ret[1])
        except Exception as err:
            l_log.error("Failed to send json! (err: %s)" % str(err))

    def send_error(pipe, error):
        """Internal worker method to send a json error through the parent socket

        Arguments:
            pipe (:obj: socket): The response pipe to send to the parent process
            error (str): The string error message to send
        """
        send_json(pipe, {"error": error})

    def load_models(pipe, config, models):
        """Internal worker method to load the language model

        Note:
            Some lanaguages take a long time to load. English is by far
            the fastest language to be loaded as a model.

        Arguments:
            pipe (:obj: socket): The response pipe to send to the parent process
            config: The pocketsphinx configuration the model paths are written into
            models (dict): The language and nltk models developed by the parent process

        Returns:
            (Decoder) The STT decoder object and the nltk model, or None
            when one of the models is invalid (an error is sent in that case)
        """
        language_model = models["language_model"]
        nltk_model = models["nltk_model"]

        if False in [
                language_model.is_valid_model(),
                nltk_model.is_valid_model()
        ]:
            l_log.error("The language model %s is invalid!" %
                        str(language_model.name))
            send_error(pipe, "Failed loading language model!")
            return

        # Load the model configurations into pocketsphinx
        config.set_string('-hmm', str(language_model.hmm))
        config.set_string('-lm', str(language_model.lm))
        config.set_string('-dict', str(language_model.dict))
        decoder = Decoder(config)

        send_json(pipe, {"success": True})  # Send a success message to the client

        l_log.debug("Set the language model to %s" % str(language_model.name))

        return decoder, nltk_model  # Return the new decoder and nltk model

    def process_text(pipe, text, is_final, args):
        """Internal worker method to process the Speech To Text phrase

        Arguments:
            pipe (:obj: socket): The response pipe to send to the parent process
            text (str): The spoken text to further process
            is_final (boo): If the text being processed is the final text else it's a partial result
            args (dict): Any other flags specifically required for a final or partial speech result
        """
        generate_keyphrases = mutex_flags["keyphrases"]["use"]
        keyphrases = []

        if generate_keyphrases:
            text_processor.generate_keyphrases(text)  # Generate keyphrases from the given text
            keyphrases_list = text_processor.get_keyphrases()

            for keyphrase in keyphrases_list:
                to_append_keyphrase = {
                    "score": keyphrase[0],
                    "keyphrase": keyphrase[1]
                }
                keyphrases.append(to_append_keyphrase)
        else:
            keyphrases = text  # Don't do any processing and just pass the text into the keyphrases

        # Generate the json to be sent back to the client
        hypothesis_results = args
        hypothesis_results["keyphrases"] = generate_keyphrases

        if is_final:
            hypothesis_results["hypothesis"] = keyphrases
        else:
            hypothesis_results["partial_hypothesis"] = keyphrases

        # Goes to the worker log instead of being printed to stdout.
        l_log.debug("Hypothesis results: %s" % str(hypothesis_results))

        # Send the results back to the client
        send_json(pipe, hypothesis_results)

    def start_audio(pipe, decoder, args):
        """Internal worker method to start the audio processing chunk sequence

        Note:
            This must be called before the process_audio method or the STT engine will not process the audio chunks

        Arguments:
            pipe (:obj: socket): The response pipe to send to the parent process
            decoder (Decoder): The pocketsphinx decoder to control the STT engine
            args (dict): All of the available arguments passed by the parent process
        """
        if decoder is None:
            l_log.error("Language model is not loaded")
            send_error(pipe, "Language model not loaded!")
            send_json(pipe, {"decoder": False})
            return

        l_log.debug("Starting the audio processing...")

        decoder.start_utt()  # Start the pocketsphinx listener

        # Tell the client that the decoder has successfully been loaded
        send_json(pipe, {"decoder": True})

    def process_audio(pipe, decoder, args):
        """Internal worker method to process an audio chunk

        Note:
            The audio chunk is expected to be in base64 format

        Arguments:
            pipe (:obj: socket): The response pipe to send to the parent process
            decoder (Decoder): The pocketsphinx decoder to control the STT engine
            args (dict): All of the available arguments passed by the parent process
        """
        if decoder is None:
            l_log.error("Language model is not loaded")
            send_error(pipe, "Language model not loaded!")
            return

        l_log.debug("Processing audio chunk!")

        audio_chunk = args["audio"]  # Retrieve the audio data
        processed_wav = audio_processor.process_chunk(
            audio_chunk)  # Process the base64 wrapped audio data

        l_log.debug("Recognizing speech...")

        decoder.process_raw(processed_wav, False,
                            False)  # Process the audio chunk through the STT engine

        hypothesis = decoder.hyp()  # Get pocketshpinx's hypothesis

        # Send back the results of the decoding
        if hypothesis is None:
            l_log.debug("Silence detected")
            send_json(pipe, {
                "partial_silence": True,
                "partial_hypothesis": None
            })
        else:
            hypothesis_results = {
                "partial_silence":
                False if len(hypothesis.hypstr) > 0 else True,
            }
            l_log.debug("Partial speech detected: %s" %
                        str(hypothesis.hypstr))
            process_text(pipe, hypothesis.hypstr, False, hypothesis_results)

        l_log.debug("Done decoding speech from audio chunk!")

    def stop_audio(pipe, decoder, args):
        """Internal worker method to stop the audio processing chunk sequence

        Note:
            This must be called after the process_audio method or the STT engine will continue to listen for audio chunks

        Arguments:
            pipe (:obj: socket): The response pipe to send to the parent process
            decoder (Decoder): The pocketsphinx decoder to control the STT engine
            args (dict): All of the available arguments passed by the parent process
        """
        if decoder is None:
            l_log.error("Language model is not loaded")
            send_error(pipe, "Language model not loaded!")
            # The pipe argument was missing here, which made this call fail.
            send_json(pipe, {"decoder": False})
            return

        l_log.debug("Stopping the audio processing...")

        decoder.end_utt()  # Stop the pocketsphinx listener

        l_log.debug("Done recognizing speech!")

        hypothesis = decoder.hyp()  # Get pocketshpinx's hypothesis
        logmath = decoder.get_logmath()

        # Send back the results of the decoding
        if hypothesis is None:
            l_log.debug("Silence detected")
            send_json(pipe, {"silence": True, "hypothesis": None})
        else:
            hypothesis_results = {
                "silence": False if len(hypothesis.hypstr) > 0 else True,
                "score": hypothesis.best_score,
                "confidence": logmath.exp(hypothesis.prob)
            }
            l_log.debug("Speech detected: %s" % str(hypothesis.hypstr))
            process_text(pipe, hypothesis.hypstr, True, hypothesis_results)

    def shutdown_thread(self, l_log):
        """Worker method to handle the checking of a shutdown call

        Note:
            To reduce overhead, this thread will only be called every 100 milliseconds
        """
        while not shutdown_flags["shutdown"]:
            try:
                if self._shutdown_event.is_set():
                    l_log.debug("Shutting down worker thread!")
                    shutdown_flags["shutdown"] = True  # Exit the main loop
                    if shutdown_flags["decoder"] is not None:
                        try:
                            shutdown_flags["decoder"].end_utt()
                        except Exception as err:
                            l_log.debug(
                                "STT decoder object returned a non-zero status"
                            )
                    else:
                        l_log.warning(
                            "The decoder object is already None!")
                    break
                sleep(0.1)
            except Exception as err:
                l_log.error(
                    "Failed shutting down worker thread! (err: %s)" %
                    str(err))

    shutdown_t = Thread(target=shutdown_thread, args=(
        self,
        l_log,
    ))
    shutdown_t.daemon = True  # attribute form of the deprecated setDaemon()
    shutdown_t.start()

    p_out, p_in = pipe
    while not shutdown_flags["shutdown"]:
        try:
            # Wait for a command from the parent process
            command = self.__get_buffered(p_out)
            if "set_models" in command["exec"]:
                # load_models returns None (after reporting the error) when
                # a model is invalid; keep the current decoder in that case
                # instead of failing on tuple unpacking.
                loaded = load_models(p_out, config, command["args"])
                if loaded is not None:
                    decoder, nltk_model = loaded
                    text_processor.set_nltk_model(
                        nltk_model)  # Set the text processor nltk model
                    shutdown_flags["decoder"] = decoder
            elif "start_audio" in command["exec"]:
                start_audio(p_out, decoder, command["args"])
            elif "process_audio" in command["exec"]:
                process_audio(p_out, decoder, command["args"])
            elif "stop_audio" in command["exec"]:
                stop_audio(p_out, decoder, command["args"])
            elif "set_keyphrases" in command["exec"]:
                mutex_flags["keyphrases"] = command["args"]
            else:
                l_log.error("Invalid command %s" % str(command))
                # Reply on the same pipe every other handler uses.
                send_error(p_out, "Invalid command!")
        except (EOFError, IOError) as err:
            continue
        except Exception as err:
            l_log.error(
                "Failed recieving command from subprocess (id: %d) (err: %s)"
                % (current_process().pid, str(err)))
np.random.seed(SEED) print('W2VEC embedding: %s' % (W2VEC_MODEL_FILE)) print('Embedding Dimension: %d' % (EMBEDDING_DIM)) print('Allowing embedding learning: %s' % (str(LEARN_EMBEDDINGS))) cf = configparser.ConfigParser() cf.read("file_path.properties") path = dict(cf.items("file_path")) dir_w2v = path['dir_w2v'] dir_in = path['dir_in'] word2vec_model = gensim.models.KeyedVectors.load_word2vec_format( dir_w2v + W2VEC_MODEL_FILE, binary=False, unicode_errors="ignore") tp = TextProcessor() texts = list() tx_class = list() tmp = list() with open(POLITICS_FILE) as l_file: for line in l_file: tmp.append(line) tx_class.append('politics') texts += tp.text_process(tmp, text_only=True) tmp = list() with open(NON_POLITICS_FILE) as l_file: for line in l_file:
parl_tw_list.append(temp) return doc_list, parl_tw_list if __name__=='__main__': cf = configparser.ConfigParser() cf.read("file_path.properties") path = dict(cf.items("file_path")) dir_in = path['dir_in'] dir_out = path['dir_out'] dir_ale = path['dir_ale'] dir_rob = path['dir_rob'] doc_list, parl_tw_list = load_files(dir_rob) tp = TextProcessor() parl_tw_processed = list() for l in parl_tw_list: parl_tw_processed.append(tp.text_process(l, text_only=True)) with open(dir_in+"coleta1.pck",'rb') as handle: coleta1 = pickle.load(handle) with open(dir_in+"coleta2.pck",'rb') as handle: coleta2 = pickle.load(handle) tweets = list(itertools.chain.from_iterable(list(itertools.chain.from_iterable(parl_tw_processed)))) tot_counter = Counter(tweets)
if __name__ == "__main__": pp = pprint.PrettyPrinter(indent=4, depth=2) # Initialize classifier classifier = NaiveBayes() # Train for f in find("data/1/training"): f = f.strip() if not f.endswith(".txt"): continue with open(f) as doc: text = doc.read() sentences = nlp.process_text(text) label = "movie" if "movie" in f else "play" classifier.train(sentences, label=label) # Test for f in find("data/1/testing"): f = f.strip() if not f.endswith(".txt"): continue with open(f) as doc: text = doc.read() sentences = nlp.process_text(text)
class ArticleExtractor(object):
    """Rank HTML elements by how closely their vocabulary matches the signal words.

    The class counts relative term frequencies in each element and in the
    configured signal-word list, then uses cosine similarity between the two
    frequency vectors to find the element that contains the news article.
    """

    def __init__(self, languge):
        # NOTE: the parameter name ``languge`` (sic) is kept unchanged so that
        # callers passing it by keyword keep working.
        self.settings = Settings()
        self.language = languge
        self.text_processor = TextProcessor(self.language)
        self.signalwords = self.settings.signal_words_ru

    def count_freqs(self, stemmed_text):
        """Compute the relative frequency of every term in ``stemmed_text``.

        Args:
            stemmed_text: a sized sequence of (stemmed) terms.

        Returns:
            A list of ``(term, relative_frequency)`` tuples, frequency rounded
            to 3 decimals, sorted by decreasing frequency.  An empty input
            yields an empty list.
        """
        counts = Counter(stemmed_text)
        total_stems_in_text = float(len(stemmed_text))
        # ``items()`` works on both Python 2 and 3 (``iteritems()`` is
        # Python 2 only).
        freqs = [(term, round(count / total_stems_in_text, 3))
                 for term, count in counts.items()]
        # Stable sort: terms with equal frequency keep first-seen order.
        freqs.sort(key=lambda pair: pair[1], reverse=True)
        return freqs

    def count_signalwords_in_html(self, texts):
        """Build a term-frequency ``Counter`` for every element in ``texts``.

        Args:
            texts: mapping of raw element -> sequence of stemmed terms.

        Returns:
            A dict mapping each raw element to a ``Counter`` of
            ``term -> relative frequency``.
        """
        return {
            raw_element: Counter(dict(self.count_freqs(stemmed_text)))
            for raw_element, stemmed_text in texts.items()
        }

    def count_signalwords_in_file(self):
        """Return a ``Counter`` of relative term frequencies of the signal words."""
        signalwords_terms = list(
            self.text_processor.process_line(self.signalwords))
        return Counter(dict(self.count_freqs(signalwords_terms)))

    @staticmethod
    def cosine_similarity(html_freqdict, signalwords_freqdict):
        """Compute the cosine similarity of two term-frequency mappings.

        https://en.wikipedia.org/wiki/Cosine_similarity

        Args:
            html_freqdict: mapping ``term -> frequency`` for an element.
            signalwords_freqdict: mapping ``term -> frequency`` for the
                signal words.

        Returns:
            The cosine of the angle between the two vectors as a float, or
            ``0.0`` when either vector has zero magnitude.
        """
        terms = set(html_freqdict).union(signalwords_freqdict)
        # ``get(k, 0)`` lets plain dicts be passed as well as Counters
        # (a missing term simply contributes 0 to the vector).
        doc_vector = numpy.asanyarray(
            [html_freqdict.get(k, 0) for k in terms], dtype=float)
        signals_vector = numpy.asanyarray(
            [signalwords_freqdict.get(k, 0) for k in terms], dtype=float)

        dot_product = float(numpy.dot(doc_vector, signals_vector))
        magnitude_v1 = math.sqrt(float(numpy.dot(doc_vector, doc_vector)))
        magnitude_v2 = math.sqrt(
            float(numpy.dot(signals_vector, signals_vector)))

        if magnitude_v1 == 0 or magnitude_v2 == 0:
            return 0.0
        return dot_product / (magnitude_v1 * magnitude_v2)

    def find_best_node(self, texts_stemmed):
        """Rank elements by cosine similarity to the signal words.

        The cosine coefficient is computed for every element, and the
        elements are sorted by decreasing coefficient.

        Args:
            texts_stemmed: mapping of raw element -> sequence of stemmed terms.

        Returns:
            A ranked list of ``(element_text, cosine_coefficient)`` tuples.
        """
        docs_index = self.count_signalwords_in_html(texts_stemmed)
        signals_freqdict = self.count_signalwords_in_file()
        relevant_elements = [
            (tag_element_text,
             self.cosine_similarity(element_terms_dict, signals_freqdict))
            for tag_element_text, element_terms_dict in docs_index.items()
        ]
        relevant_elements.sort(key=lambda pair: pair[1], reverse=True)
        return relevant_elements
import json import pymongo import configparser import os from text_processor import TextProcessor import xlrd cf = configparser.ConfigParser() cf.read("file_path.properties") path = dict(cf.items("file_path")) dir_in = path['dir_in'] dir_xls = path['dir_xls'] client = pymongo.MongoClient("mongodb://localhost:27017") db = client.twitterdb tp = TextProcessor() tw_files = sorted([file for root, dirs, files in os.walk(dir_in) for file in files if file.endswith('.json')]) excel = True if excel: sheet_name = "nao_eleitos" col = 4 rep_dic = {} for fname in tw_files: rep_dic[fname.split('_',1)[0]] = fname xls = xlrd.open_workbook(dir_xls) sheet = xls.sheet_by_name(sheet_name) for i in range(sheet.nrows): id_rep = str(int(sheet.cell_value(rowx= i, colx=col))) if (id_rep in rep_dic):
def setText(self, text):
    """Run ``text`` through a ``TextProcessor`` and store the processed result in ``self.text``."""
    self.text = TextProcessor(text).getProcessedText()
for tw in dep: tw_dic[str(month_tw(tw[0]))+"_"+str(i)]=tw[1] save_file(dir_out,tw[1],i,month_tw(tw[0])) random_pck =list() with open(dir_out+"random-pck/coleta1.pck",'rb') as handle: random_pck.append(pickle.load(handle)) with open(dir_out+"random-pck/coleta2.pck",'rb') as handle: random_pck.append(pickle.load(handle)) month_files = list() for m in range(10): month_files.append(load_files(dir_out+"tw_month/month_"+str(m)+"/")) tp = TextProcessor() month_processed =list() for tw in month_files: tmp = list() for dep in tw: tmp.append(tp.text_process(dep,text_only=True)) month_processed.append(tmp) ranked_month = list() for i,month in enumerate(month_processed): tmp = tfidf_month(month,random_pck) ranked_month.append(tmp) save_pck(dir_out+"tw_month/month_"+str(i)+"/",tmp) tfidf = TfIdf()
def process_words():
    """Process the ``text`` request argument and return it as JSON under the ``response`` key."""
    processor = TextProcessor()
    return jsonify(response=processor.process(request.args.get('text')))
#1380844800000 = 04/10/2013, 86400000 = 1 day return 1380844800000+(days*86400000) def days2timeInterval(day1, day2): #1380844800000 = 04/10/2013, 86400000 = 1 day return (1380844800000+(day1*86400000)), (1380844800000+(day2*86400000)) if __name__=='__main__': dir_in = "/Users/lucasso/Dropbox/Twitter_Marcelo/Report/coleta_pedro/" dir_out = "/Users/lucasso/Dropbox/Twitter_Marcelo/Report/plot/" excel_path = "/Users/lucasso/Dropbox/Twitter_Marcelo/Arquivo Principal da Pesquisa - Quatro Etapas.xls" sheet_name = "amostra" col = 4 rt = ReadTwitter(dir_in, excel_path, sheet_name, col ) tp = TextProcessor() id_rep, names = rt.names_from_xls() for idx in range(len(names)): tweets = list() graphs = list() tw = nx.Graph() data = rt.tweets_election_data(id_rep[idx]) diction = {k:v for (k,v) in data.items()} for i in diction: tweets.append(list(itertools.chain.from_iterable(tp.text_process(diction[i].split()))))
def process_file(file_id):
    """Extract, process and store the text of the ``File`` record ``file_id``.

    The record's ``progress`` field is advanced in small increments and saved
    after each one, with short sleeps between some of the steps.

    Raises:
        ValueError: if a non-textbox input ends up without a document object.
    """
    file = File.objects.get(pk=file_id)
    print(file)
    try:
        origin_path = file.origin_file.path
    except ValueError:
        # No usable origin file path; the textbox branch below does not use
        # one.  NOTE(review): presumably raised when no file is attached -
        # confirm.
        origin_path = None
    file.input_type = get_input_type(file)
    print(file.input_type)
    file.progress += 10
    file.save()
    sleep(.5)
    file.progress += 10
    file.save()
    # Obtain the raw text; ``document`` stays None for textbox input.
    document = None
    if file.input_type == File.InputTypes.IMAGE:
        document = Document()
        text = image_to_text(origin_path)
    elif file.input_type == File.InputTypes.TEXTBOX:
        text = file.origin_text
    else:
        document = Document(origin_path, text_params)
        text = document.parse()
    # file.progress += 50
    file.progress += 20
    file.save()
    sleep(.5)
    file.progress += 10
    file.save()
    sleep(.5)
    file.progress += 10
    file.save()
    text_processor = TextProcessor()
    processed_text = text_processor.process(text)
    # file.progress += 30
    file.progress += 20
    file.save()
    sleep(.5)
    file.progress += 10
    file.save()
    # Store the result: inline for textbox input, otherwise written back into
    # the document and saved under the output name.
    if file.input_type == File.InputTypes.TEXTBOX:
        file.processed_text = processed_text
    else:
        if document is None:
            raise ValueError('Error with document')
        output_name = get_output_field(file)
        document.change_text(processed_text)
        document.save(file.processed_file.storage.path(output_name))
        file.processed_file = output_name
    sleep(.5)
    file.progress = 100
    file.save()
    print(file)
def main():
    """Train the TextCNN classifier, checkpoint the best epoch, then prompt interactively.

    Steps:
      1. Load the embedding vectors and word-to-index table and build the
         dataset.
      2. Split it into train/test sets and build a class-weighted sampler
         for the training set.
      3. Train for ``EPOCHS`` epochs, evaluating on the test set after each
         epoch and saving the state dict whenever accuracy improves.
      4. Enter an endless prompt loop that prints the model output for
         typed text.
    """
    device = torch.device('cuda')

    embedding_vectors = torch.load(f'{EMBEDDINGS_DIR}/vectors.pkl')

    # Read the word-to-index table inside a context manager so the file
    # handle is closed instead of being leaked.
    with open(f'{EMBEDDINGS_DIR}/wti.pkl', 'rb') as wti_file:
        wti = pickle.load(wti_file)

    text_processor = TextProcessor(
        wti=wti,
        tokenizer=get_tokenizer('basic_english'),
        standardize=True,
        min_len=3,
    )

    dataset = TextDataset(CORPUS_DIR, text_processor)

    # Split into training and test set.  The test length is derived from the
    # remainder: truncating both parts independently with int() can make the
    # lengths sum to less than len(dataset), which random_split rejects.
    train_len = int(len(dataset) * DATA_SPLIT)
    test_len = len(dataset) - train_len
    train_set, test_set = torch.utils.data.random_split(
        dataset, [train_len, test_len])

    # count number of samples in each class
    class_count = [0, 0]
    for data, label in dataset:
        class_count[int(label.item())] += 1

    # get relative weights for classes
    _sum = sum(class_count)
    class_count[0] /= _sum
    class_count[1] /= _sum

    # reverse the weights since we're getting the inverse for the sampler
    class_count = list(reversed(class_count))

    # set weight for every sample
    weights = [class_count[int(x[1].item())] for x in train_set]

    # weighted sampler
    sampler = torch.utils.data.WeightedRandomSampler(
        weights=weights, num_samples=len(train_set), replacement=True)

    train_loader = DataLoader(dataset=train_set,
                              batch_size=32,
                              collate_fn=Sequencer(SEQUENCE_LEN),
                              sampler=sampler)
    test_loader = DataLoader(dataset=test_set,
                             batch_size=32,
                             collate_fn=Sequencer(SEQUENCE_LEN))

    # number of filters in each convolutional filter
    N_FILTERS = 64
    # sizes and number of convolutional layers
    FILTER_SIZES = [2, 3]
    # dropout for between conv and dense layers
    DROPOUT = 0.5

    model = TextCNN(
        embeddings=embedding_vectors,
        n_filters=N_FILTERS,
        filter_sizes=FILTER_SIZES,
        dropout=DROPOUT,
    ).to(device)

    print(model)
    print('Trainable params:',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    EPOCHS = 12
    best_acc = 0.0

    # training loop
    for epoch in range(EPOCHS):
        print('Epoch', epoch + 1)

        for i, data in tqdm(enumerate(train_loader), total=len(train_loader)):
            # get word indices vector and corresponding labels
            x, labels = data

            # send to device
            x = x.to(device)
            labels = labels.to(device)

            # make predictions
            predictions = model(x).squeeze()

            # calculate loss
            loss = criterion(predictions, labels)

            # learning stuff...
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # evaluate
        with torch.no_grad():
            model.eval()

            correct = 0
            wrong = 0
            # confusion matrix indexed as m[truth][prediction]
            m = [[0, 0], [0, 0]]

            for data in test_loader:
                x, label = data
                x = x.to(device)

                predictions = model(x).squeeze()

                for truth, prediction in zip(label, predictions):
                    y = int(truth.item())
                    y_pred = 1 if prediction.item() > 0.5 else 0

                    m[y][y_pred] += 1

                    if y == y_pred:
                        correct += 1
                    else:
                        wrong += 1

        model.train()

        acc = correct / (correct + wrong)
        if acc > best_acc:
            best_acc = acc
            # NOTE(review): this removes 'model_*.pth' while checkpoints are
            # written as 'state_{epoch}.pth', so earlier state files are not
            # cleaned up here - confirm which pattern is intended.
            for file in glob.glob('models/model_*.pth'):
                os.remove(file)
            torch.save(model.state_dict(), f'models/state_{epoch}.pth')

        print()
        print('Correct:', f'{correct}/{correct + wrong}', 'Accuracy:', acc)
        print('[[TN, FP], [FN, TP]]')
        print(m)
        print()

    # put into evaluation mode
    model.eval()

    text_processor.do_standardize = True

    with torch.no_grad():
        while True:
            text = input('Prompt: ')
            x = text_processor.process(text)
            x = torch.tensor(x).unsqueeze(dim=0)
            print(model(x.to(device)).squeeze())
import nltk from nltk.stem.lancaster import LancasterStemmer from nltk.corpus import stopwords, brown from text_processor import TextProcessor from keyword_retrieval import KeywordRetrieval from datetime import datetime if __name__ == "__main__": text = raw_input("Please input your text (It's better not to input more than 15 words)\n>> ") # case based ontology print "Case: [[ Ontology-based method]]\nText :%s" % text # create text processor to get keywords start = datetime.now() text_processor = TextProcessor(text = text) keywords = text_processor.get_keywords() if keywords: dos = {} print "Extracted ontology keywords:", keywords for word in keywords: kr = KeywordRetrieval(keyword=word) dos[word] = kr.get_result() else: print "The model does not extract any keywords" end = datetime.now() time_1 = end - start for word in keywords: if dos[word]: # sorted by name's length(similarity) for do in sorted(dos[word], key=lambda do: len(do.name)): # print do's information print "_"*100
def form_valid(self, form):
    """Fill ``processed_text`` on the form instance from ``origin_text``, then defer to the parent."""
    processed = TextProcessor().process(form.cleaned_data['origin_text'])
    form.instance.processed_text = processed
    return super().form_valid(form)
return w1 == separator_word or w2 == separator_word or w3 == separator_word if __name__=='__main__': cf = configparser.ConfigParser() cf.read("file_path.properties") path = dict(cf.items("file_path")) dir_in = path['dir_in'] dir_out = path['dir_out'] dir_ale = path['dir_ale'] dir_pck = path['dir_pck'] doc_list, parl_tw_list = load_files(dir_in) _ ,list_aleatory = load_files(dir_ale) tp = TextProcessor() tweets = tp.text_process(doc_list,text_only=True) tw_words = add_separator(tweets) parl_bigrams = get_bigrams(tw_words,3,True) #processa os tweets de cada deputado parl_processed = list() parl_tri_processed = list() for l in parl_tw_list: temp = add_separator(tp.text_process(l,text_only=True)) parl_tri_processed.append(get_trigrams(temp, 3, True)) parl_processed.append(get_bigrams(temp,3,True)) with open(dir_out+"list_dept_bigrams_.pck", 'wb') as handle: pickle.dump(parl_processed, handle) with open(dir_out+"list_dept_trigrams_.pck", 'wb') as handle:
def main(argv=None): """ Training. """ ### parametres LEARNING_RATE = FLAGS.LEARNING_RATE NUMBER_OF_FRAMES = FLAGS.NUMBER_OF_FRAMES BATCH_SIZE = FLAGS.BATCH_SIZE EPOCH = FLAGS.EPOCH TRAINING_DEVICE = FLAGS.TRAINING_DEVICE VOCAB_SIZE = FLAGS.VOCAB_SIZE NUMBER_OF_WORDS = FLAGS.NUMBER_OF_WORDS HIDDEN_SIZE = FLAGS.HIDDEN_SIZE INPUT_SIZE = FLAGS.INPUT_SIZE NUMBER_OF_LAYERS = FLAGS.NUMBER_OF_LAYERS tsfm = transforms.Compose([ transforms.Resize([224, 224]), transforms.ToTensor(), transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), ]) train_corpus = FLAGS.train_corpus utils = Utils() all_text = utils.output_text(train_corpus) text_processor = TextProcessor(freq_threshold=10) dictionary = text_processor.vocab_creator(all_text) ### Model definition encoder = Encoder_LSTM(input_size=INPUT_SIZE, hidden_size=HIDDEN_SIZE, num_layers=NUMBER_OF_LAYERS) decoder = Decoder_LSTM(input_size=VOCAB_SIZE, hidden_size=HIDDEN_SIZE, num_layers=NUMBER_OF_LAYERS, number_of_words=NUMBER_OF_WORDS) model_seq_to_seq = Seq2Seq(encoder, decoder).to(device) model = model_seq_to_seq ### load the state_dict of model if model has been pretrained. 
model.load_state_dict(torch.load(FLAGS.load_weights)) #### Model Testing model.eval() from random import randint import matplotlib.pyplot as plt utils = Utils() video_path = FLAGS.video_file video_pre_data = utils.video_to_frames(video_path, frame_number=NUMBER_OF_FRAMES, device='cuda', INPUT_SIZE=INPUT_SIZE, model=md.model_vgg, transform=tsfm) X_2 = torch.zeros([NUMBER_OF_WORDS, VOCAB_SIZE]) for i in range(NUMBER_OF_WORDS): if (i == 0): X_2[i][2] = 1 else: X_2[i][1] = 1 input_data = video_pre_data.unsqueeze(0) final_sentence = [] X_2 = X_2.unsqueeze(0) X_2 = X_2.to(device) input_data = input_data.to(device) for i in range(NUMBER_OF_WORDS - 1): with torch.no_grad(): predicted = model(input_data, X_2) predicted = predicted.squeeze(0) final_sentence.append( next((key for key, value in dictionary.items() if value == torch.argmax(predicted[i])), None)) X_2[0][i + 1][torch.argmax(predicted[i])] = 1 X_2[0][i + 1][1] = 0 print(final_sentence)
#1380844800000 = 04/10/2013, 86400000 = 1 day return 1380844800000+(days*86400000) def days2timeInterval(day1, day2): #1380844800000 = 04/10/2013, 86400000 = 1 day return (1380844800000+(day1*86400000)), (1380844800000+(day2*86400000)) if __name__=='__main__': dir_in = "/Users/lucasso/Documents/tweets_pedro/" dir_out = "/Users/lucasso/Dropbox/Twitter_Marcelo/Report/plot/" excel_path = "/Users/lucasso/Dropbox/Twitter_Marcelo/Arquivo Principal da Pesquisa - Quatro Etapas.xls" sheet_name = "amostra" col = 4 rt = ReadTwitter(dir_in, excel_path, sheet_name, col ) tp = TextProcessor() id_rep, names = rt.names_from_xls() parl_words = Counter() counter_list = list() tw_apriori = list() for idx in range(len(names)): tweets = list() data = rt.tweets_election_data(id_rep[idx]) for k,v in data.items(): tweets.append(tp.text_process(v.split())) tw_apriori += [[x[0] for x in e if x] for e in tweets if e ]
with open(dir_in+tw_file) as data_file: for line in data_file: tweet = json.loads(line) temp.append(tweet['text']) doc_list.append(temp) return doc_list, tw_files if __name__ == "__main__": cf = configparser.ConfigParser() cf.read("../file_path.properties") path = dict(cf.items("file_path")) dir_in = path['dir_val'] tp = TextProcessor() tweets = list() doc_list, tw_files = load_files(dir_in) for txt in doc_list: print(len(doc_list)) tweets.append(tp.text_process(txt, text_only=True)) for i, fl in enumerate(tw_files): f = open(dir_in+"%s.txt" % fl.split('.')[0], 'w') for tw in tweets[i]: f.write(" ".join(tw) + "\n") f.close()
def setUp(self):
    """Create a fresh ``TextProcessor`` and store it on ``self.tp``."""
    self.tp = TextProcessor()
embeddings = torch.load(f'{EMBEDDINGS_DIR}/vectors.pkl') model = TextCNN( embeddings=embeddings, n_filters=64, filter_sizes=[2, 3], dropout=0.0, ) device = torch.device('cpu') model.load_state_dict(torch.load('model.pth', map_location=device)) model.eval() text_processing = TextProcessor( wti=pickle.load(open(f'{EMBEDDINGS_DIR}/wti.pkl', 'rb')), tokenizer=get_tokenizer('basic_english'), standardize=True, min_len=3, ) @app.post('/game') async def game(request: Request): q = request.form.get('q', None) if q is None: return HTTPResponse(status=400) tokens = text_processing.process(q) x = torch.unsqueeze(tokens, dim=0) pred = model(x)
class TextProcessorTest(unittest.TestCase):
    """Checks that spoken-style phrases map to the expected web action and arguments.

    Every test builds a list of phrases (optionally with expected arguments
    and the URL of the current page) and delegates to ``validate_phrases``.
    """

    tp = None

    def setUp(self):
        """Create a fresh ``TextProcessor`` for every test."""
        self.tp = TextProcessor()

    def validate_phrases(self, phrases, action, args=None, urls=None):
        """Assert that every phrase resolves to ``action`` with the expected arguments.

        Args:
            phrases: phrases to feed to ``process_web_action_request``.
            action: command expected under ``h.CMD`` of each result.
            args: optional list, indexed like ``phrases``, of the argument
                values expected for each phrase.  When omitted or empty, the
                arguments are not checked.
            urls: optional list, indexed like ``phrases``, of the current URL
                to pass with each phrase.  When omitted or empty,
                ``"google.com"`` is used for every phrase.
        """
        # ``None`` defaults avoid sharing one mutable list between calls.
        args = [] if args is None else args
        urls = [] if urls is None else urls
        has_args = len(args) > 0
        has_urls = len(urls) > 0
        curr_url = "google.com"

        for curr_phrase, phrase in enumerate(phrases):
            if has_urls:
                curr_url = urls[curr_phrase]

            web_action = self.tp.process_web_action_request(phrase, curr_url)
            self.assertNotEqual(web_action, None)
            self.assertTrue(web_action[h.CMD] in self.tp.action_text_mappings.keys())
            self.assertEqual(action, web_action[h.CMD])

            # Single-argument print(...) prints the same on Python 2 and 3.
            print(phrase.strip())
            print(web_action)

            if has_args:
                expected_args = args[curr_phrase]
                actual_values = web_action['arguments'].values()
                num_args = sum(1 for arg in expected_args if arg in actual_values)
                # Every expected argument must be present ...
                self.assertEqual(len(expected_args), num_args)
                # ... and no other non-empty argument may be produced.
                nonempty_args = [x for x in web_action['arguments'].values() if x]
                self.assertEqual(num_args, len(nonempty_args))

    def test_scroll_left(self):
        template_phrases = ["scroll left", "left scroll"]
        self.validate_phrases(template_phrases, h.SCROLL_LEFT)

    def test_scroll_right(self):
        template_phrases = ["scroll right", "right scroll"]
        self.validate_phrases(template_phrases, h.SCROLL_RIGHT)

    def test_scroll_up(self):
        template_phrases = ["scroll up", "up scroll", "scroll up one page", "scroll up ten pages"]
        args = [[1], [1], [1], [10]]
        self.validate_phrases(template_phrases, h.SCROLL_UP, args)

    def test_scroll_down(self):
        template_phrases = ["scroll down", "down scroll", "scroll down one page", "scroll down four pages"]
        args = [[1], [1], [1], [4]]
        self.validate_phrases(template_phrases, h.SCROLL_DOWN, args)

    def test_zoom_in(self):
        template_phrases = ["zoom in by ten percent", "zoom in 20 percent", "zoom", "zoom larger", "zoom in"]
        args = [[10], [20], [25], [25], [25]]
        self.validate_phrases(template_phrases, h.ZOOM_IN, args)

    def test_zoom_out(self):
        template_phrases = ["zoom out by fifty percent", "zoom out by thirty five percent",
                            "zoom away 30 percent", "zoom out 100 percent", "zoom smaller", "zoom out"]
        args = [[50], [35], [30], [100], [25], [25]]
        self.validate_phrases(template_phrases, h.ZOOM_OUT, args)

    def test_open_new_tab(self):
        template_phrases = ["open tab Spotify", "open tab cnn", "open a tab facebook.com", "new tab google.com",
                            "open a tab", "open a new tab", "new tab", "open new tab", "create tab",
                            "create a new tab", "create new tab"]
        # The padding is longer than needed; surplus entries are never read.
        args = [["spotify"], ["cnn"], ['facebook.com'], ['google.com']] + [[] for _ in range(len(template_phrases))]
        self.validate_phrases(template_phrases, h.OPEN_TAB, args)

    def test_close_tab(self):
        template_phrases = ["close tab three", "close tab facebook", "exit tab 2", "exit tab StackOverflow",
                            "leave tab twelve", "leave tab google"]
        args = [[3], ["facebook"], [2], ["stackoverflow"], [12], ["google"]]
        self.validate_phrases(template_phrases, h.CLOSE_TAB, args)

    def test_switch_tab(self):
        template_phrases = ["switch to facebook", "switch to four", "switch to tab three",
                            "switch to tab google.com", "switch to Facebook", "change to CNN",
                            "open the twelfth tab", "switch to the first tab", "change to Facebook tab",
                            "change to tab four", "change to Pandora", "change tab to tab 4",
                            "change tab to the weather"]
        args = [["facebook"], [4], [3], ["google.com"], ["facebook"], ["cnn"], [12], [1], ["facebook"], [4],
                ["pandora"], [4], ["the weather"]]
        self.validate_phrases(template_phrases, h.SWITCH_TAB, args)

    def test_forward_page(self):
        # Not currently exercised: "forward", "go forward", "go forward a page"
        template_phrases = ["go to the next page", "next page", "ahead a page", "forward a page",
                            "one page forward", "page forward", "page ahead"]
        self.validate_phrases(template_phrases, h.FORWARD)

    def test_backward_page(self):
        template_phrases = ["back", "backward", "go backward", "go backward a page", "go back a page",
                            "go to the previous page", "previous page", "back a page", "backward a page",
                            "one page backward", "page backward", "page back"]
        self.validate_phrases(template_phrases, h.BACKWARD)

    def test_refresh_page(self):
        template_phrases = ["refresh the page", "refresh page", "page refresh", "refresh this page"]
        self.validate_phrases(template_phrases, h.REFRESH)

    def test_click_element(self):
        template_phrases = ["open link facebook", "click google doc link", "click link github.com",
                            "open link w w w dot google dot com", "click github dot com", "click search",
                            "click the home button", "click submit", "click more", "click sent mail",
                            "click the submit button", "click the post button", "click the home button"]
        args = [['facebook'], ['google doc'], ['github.com'], ['www.google.com'], ['github.com'], ['search'],
                ['home'], ['submit'], ['more'], ['sent mail'], ['submit'], ['post'], ['home']]
        self.validate_phrases(template_phrases, h.CLICK, args)

    def test_open_url(self):
        template_phrases = ["open www.google.com in the current tab", "open facebook.com", "open new google.com",
                            "open accuweather.com in this tab", "open pandora.com in this tab",
                            "open youtube.com in this tab", "open spotify.com in this tab"]
        args = [['www.google.com', 'true'], ['facebook.com', 'false'], ['google.com', 'false'],
                ['accuweather.com', 'true'], ['pandora.com', 'true'], ['youtube.com', 'true'],
                ['spotify.com', 'true']]
        self.validate_phrases(template_phrases, h.OPEN_URL, args)

    def test_select_element(self):
        template_phrases = ["select search box", "select what are you interested in?", "select username",
                            "select search", "select password", "select search facebook",
                            "select what's on your mind?", "select write a comment..."]
        args = [['search box'], ['what are you interested in?'], ['username'], ['search'], ['password'],
                ['search facebook'], ['what\'s on your mind?'], ["write a comment..."]]
        self.validate_phrases(template_phrases, h.SELECT_ELEMENT, args)

    def test_enter_text(self):
        template_phrases = ["enter senior project has been a long process of testing",
                            "enter text I feel great today for some reason",
                            "write I feel great today and want to go on vacation",
                            "enter text the wheels on the car are worth $2500"]
        args = [['senior project has been a long process of testing'],
                ['i feel great today for some reason'],
                ['i feel great today and want to go on vacation'],
                ['the wheels on the car are worth $2500']]
        self.validate_phrases(template_phrases, h.ENTER_TEXT, args)

    def test_submit_text(self):
        template_phrases = ["submit text", "submit"]
        # Explicit empty lists: assert that no arguments are produced.
        args = [[], []]
        self.validate_phrases(template_phrases, h.SUBMIT_TEXT, args)

    def test_open_help(self):
        template_phrases = ["help please", "please help", "open help", "open browsing assistance",
                            "browsing assistance", "assistance", "assistant", "helper", "help window",
                            "help me", "show hints", "open hints", "display hints", "list functions",
                            "list commands", "list actions", "show actions", "show commands"]
        self.validate_phrases(template_phrases, h.OPEN_HELP)

    def test_close_help(self):
        template_phrases = ["close help", "close help page", "hide commands", "hide help", "hide hints",
                            "hide functions", "close browsing assistance"]
        self.validate_phrases(template_phrases, h.CLOSE_HELP)

    # start video context
    def test_play_video(self):
        template_phrases = ["play", "play video", "play movie", "start", "start video", "start movie"]
        urls = [youtube] * len(template_phrases)
        self.validate_phrases(template_phrases, h.PLAY_VIDEO, urls=urls)

    def test_pause_video(self):
        template_phrases = ["stop", "stop video", "stop movie", "stop youtube", "paws", "pause", "paws movie",
                            "paws video", "paws youtube", "pause youtube", "pause video", "pause movie"]
        urls = [youtube] * len(template_phrases)
        self.validate_phrases(template_phrases, h.PAUSE_VIDEO, urls=urls)

    def test_next_video(self):
        template_phrases = ["next", "next video", "next movie", "next video in playlist",
                            "next movie in playlist"]
        urls = [youtube] * len(template_phrases)
        self.validate_phrases(template_phrases, h.NEXT_VIDEO, urls=urls)

    def test_open_fullscreen(self):
        template_phrases = ["fullscreen", "full screen", "open fullscreen", "open full screen",
                            "toggle fullscreen", "toggle full screen"]
        urls = [youtube] * len(template_phrases)
        self.validate_phrases(template_phrases, h.OPEN_FULLSCREEN, urls=urls)

    def test_close_fullscreen(self):
        template_phrases = ["close", "exit", "escape", "quit", "quit fullscreen", "close fullscreen",
                            "close full screen", "exit fullscreen", "exit full screen",
                            "toggle fullscreen off", "toggle full screen off"]
        urls = [youtube] * len(template_phrases)
        self.validate_phrases(template_phrases, h.CLOSE_FULLSCREEN, urls=urls)

    # start music context
    def test_play_music(self):
        template_phrases = ["play", "start", "play music", "play my music", "play song", "play tune",
                            "start music", "start song", "start tune"]
        args = [['true'], ['false'], ['true'], ['false'], ['false'], ['false'], ['true'], ['false'], ['true']]
        urls = [spot, pandora, spot, pandora, pandora, pandora, spot, pandora, spot]
        self.validate_phrases(template_phrases, h.PLAY_MUSIC, args, urls=urls)

    def test_pause_music(self):
        template_phrases = ["pause", "pause music", "paws music", "paws", "paws song", "stop", "stop music",
                            "stop my music", "stop song", "stop tune"]
        args = [['true'], ['false'], ['true'], ['false'], ['false'], ['false'], ['false'], ['true'], ['false'],
                ['true']]
        urls = [spot, pandora, spot, pandora, pandora, pandora, pandora, spot, pandora, spot]
        self.validate_phrases(template_phrases, h.PAUSE_MUSIC, args, urls=urls)

    def test_next_song(self):
        template_phrases = ["next", "next song", "next tune", "next on playlist", "next in playlist"]
        # NOTE(review): args/urls hold six entries for five phrases; the
        # sixth entry is never read.
        args = [['true'], ['false'], ['true'], ['false'], ['false'], ['true']]
        urls = [spot, pandora, spot, pandora, pandora, spot]
        self.validate_phrases(template_phrases, h.NEXT_SONG, args, urls=urls)

    def test_search_music(self):
        template_phrases = ["search artist Elvis Presley", "search artist Led Zeppelin",
                            "search for artist red hot chili peppers",
                            "search album the song remains the same", "search for album by the way",
                            "search song star-spangled banner", "search for song Whole Lotta Love",
                            "search for song one"]
        args = [['false', 'elvis presley', 'artist'], ['true', 'led zeppelin', 'artist'],
                ['false', 'red hot chili peppers', 'artist'],
                ['false', 'the song remains the same', 'album'], ['true', 'by the way', 'album'],
                ['true', 'star-spangled banner', 'song'], ['true', 'whole lotta love', 'song'],
                ['false', 'one', 'song']]
        urls = [pandora, spot, pandora, pandora, spot, spot, spot, pandora]
        self.validate_phrases(template_phrases, h.SEARCH_MUSIC, args, urls=urls)

    # start doc context
    def test_go_to_page_pdf(self):
        template_phrases = ["go to page four hundred five", "go to page sixty seven",
                            "go to two thousand seven hundred fifty three"]
        args = [[405], [67], [2753]]
        urls = [pdf] * 3
        self.validate_phrases(template_phrases, h.GO_TO_PDF_PAGE, args, urls=urls)
CLASS_WEIGHT = args.class_weight N_ESTIMATORS = int(args.estimators) LOSS_FUN = args.loss KERNEL = args.kernel print('Word2Vec embedding: %s' %(W2VEC_MODEL_FILE)) print('Embedding Dimension: %d' %(EMBEDDING_DIM)) cf = configparser.ConfigParser() cf.read("../file_path.properties") path = dict(cf.items("file_path")) dir_w2v = path['dir_w2v'] dir_in = path['dir_in'] word2vec_model = gensim.models.Word2Vec.load(dir_w2v+W2VEC_MODEL_FILE) tp = TextProcessor() doc_list, tw_class = load_files(dir_in) tweets = tp.text_process(doc_list, text_only=True) tweets = select_tweets(tweets) X, Y = gen_data(tweets, tw_class) model = classification_model(X, Y, MODEL_TYPE) joblib.dump(model, dir_in + MODEL_TYPE + '.skl') # python BoWV.py --model logistic --seed 42 -f model_word2vec -d 100 --folds 10 # python BoWV.py --model gradient_boosting --seed 42 -f model_word2vec -d 100 --loss deviance --folds 10 # python BoWV.py --model random_forest --seed 42 -f model_word2vec -d 100 --estimators 20 --folds 10 # python BoWV.py --model svm_linear --seed 42 -f model_word2vec -d 100 --loss squared_hinge --folds 10 # python BoWV.py --model svm --seed 42 -f model_word2vec -d 100 --kernel rbf --folds 10
if __name__=='__main__': cf = configparser.ConfigParser() cf.read("file_path.properties") path = dict(cf.items("file_path")) dir_in = path['dir_in'] dir_out = path['dir_out'] dir_ale = path['dir_ale'] dir_pck = path['dir_pck'] doc_list, parl_tw_list = load_files(dir_in) _ ,list_aleatory = load_files(dir_ale) tp = TextProcessor() tweets = tp.text_process(doc_list, text_only=True) parl_tw_processed = list() for l in parl_tw_list: parl_tw_processed.append(tp.text_process(l, text_only=True)) alea_tw_processed = list() for l in list_aleatory: alea_tw_processed.append(tp.text_process(l, text_only=True)) for i,l in enumerate(alea_tw_processed): alea_tw_processed[i] = [n for n in l if n] with open(dir_out+"bgr_tfidf_like.pck",'rb') as handle: parl_bigrams = pickle.load(handle)