def main():
    """Preprocess bug reports streamed as JSON lines on stdin.

    For each input line, the 'title' and 'description' fields are run
    through the custom tokenizer and replaced by the space-joined tokens;
    the updated report is written back out as one JSON line on stdout.
    """
    # Directory where this script is located; the ignore-term files are
    # expected to live next to it.
    dirname = os.path.dirname(__file__)

    # Collect the terms the tokenizer should ignore, one term per line.
    # Fix: skip blank lines so empty strings never end up in the ignore
    # list (the original kept '' for every blank line in the files).
    ignore_terms = []
    for ignore_file_name in IGNORE_TERMS_FILE_NAMES:
        with open(os.path.join(dirname, ignore_file_name), encoding='utf-8') as file:
            ignore_terms.extend(
                term for term in (line.strip() for line in file) if term)

    # Create our custom tokenizer; it receives the terms we want to ignore.
    preprocessor = Preprocessor(word_chars='a-zA-Z0-9',
                                inter_chars="'",
                                min_length=3,
                                ignore=ignore_terms)

    for line in sys.stdin:
        bug_report = json.loads(line)
        old_title = bug_report['title']
        old_description = bug_report['description']
        bug_report['title'] = ' '.join(preprocessor.preprocess(old_title))
        bug_report['description'] = ' '.join(
            preprocessor.preprocess(old_description))
        print(json.dumps(bug_report))
import time
from datetime import datetime, timedelta

import pandas as pd

from utils import Log
from db import DBProvider
from preprocessing import Preprocessor
from config_provider import config

# Guard the pipeline behind __main__ so merely importing this module does
# not open a database connection and trigger a full preprocessing run as
# a side effect.
if __name__ == "__main__":
    db, log = DBProvider(), Log(config, 'debug')
    preprocessor = Preprocessor(db, log)
    preprocessor.preprocess()
def preprocess(self):
    """Run the dataset through a Preprocessor and cache its outputs.

    Stores the classification targets in ``self.ground_truth`` (as a
    numpy array via ``.values``) and the numerical features in
    ``self.numerical``.
    """
    prep = Preprocessor(self.dataset)
    prep.preprocess()
    self.ground_truth = prep.get_classification_data().values
    self.numerical = prep.get_numerical()
# --- Daily update pipeline: refresh current matches, scrape tomorrow's
# --- future matches, then preprocess everything for downstream use.
# NOTE(review): `now` is not defined in this span — presumably set earlier
# in the file (e.g. datetime.now()); confirm against the full script.
db, log = DBProvider(), Log(config, 'update')
# 04:00 of the current day serves as the cut-off between "current" and
# "future" matches.
now_morning = datetime(now.year, now.month, now.day, 4)
start_time = time.time()
log.debug(f"Clear RAM {config.OS.clear_ram()}")
log.debug(f'Current DB is {config.Database}')
log.debug(f"Update current_matches to {now_morning}")
# Everything up to this morning's cut-off (from_time=None → no lower bound).
updater = CurrentUpdater(LeagueScraper(), MatchScraper(from_time=None, to_time=now_morning), db, log)
updater.update()
log.debug(f"Updated current_matches for {int(time.time() - start_time)} sec")
# Scrape window for future matches: [today 04:00, tomorrow 04:00).
next_day = now + timedelta(days=1)
next_day_morning = datetime(next_day.year, next_day.month, next_day.day, 4)
start_time = time.time()
log.debug(f"Update future_matches from {now_morning} to {next_day_morning}")
updater = FutureUpdater(
    FutureLeagueScraper(),
    MatchScraper(from_time=now_morning, to_time=next_day_morning), db, log)
updater.update()
log.debug(f"Updated future_matches for {int(time.time() - start_time)} sec")
start_time = time.time()
log.debug(f"Clear RAM {config.OS.clear_ram()}")
# Finally, preprocess the newly scraped matches; preprocess() returns the
# number of matches handled (used only for logging here).
preprocessor = Preprocessor(db, log)
log.debug('Preprocess matches')
num_matches = preprocessor.preprocess()
log.debug(
    f'Preprocessed {num_matches} matches for {int(time.time() - start_time)} sec'
)
# from attention import AttentionLayer
import os

import pandas as pd
import tensorflow as tf
# Fix: import the layers from tensorflow.keras directly. The original
# `keras = tf.keras` followed by `from keras.layers import ...` actually
# imported the standalone `keras` package, not tf.keras — the local
# assignment has no effect on `from keras...` imports.
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

if __name__ == "__main__":
    # NOTE(review): `Preprocessor` is not imported in this view — confirm it
    # is imported elsewhere in the file.
    path = "/Users/seungyoungoh/workspace/text_summarization_project/"
    # `on_bad_lines="warn"` replaces `error_bad_lines=False`, which was
    # removed in pandas 2.0; both warn about and skip malformed CSV rows.
    # os.path.join also avoids the double slash the old `path + "/data/..."`
    # concatenation produced.
    data = pd.read_csv(os.path.join(path, "data/sample.csv"),
                       on_bad_lines="warn")
    data = data.rename({
        'body': 'src',
        'key_point': 'smry'
    }, axis='columns')[['src', 'smry']]
    pr = Preprocessor(data)
    src_max_len, smry_max_len, src_vocab, smry_vocab, X_train, X_test, y_train, y_test = pr.preprocess(
    )
    # ### modeling (work in progress — kept for reference)
    # embedding_dim = 128
    # hidden_size = 256
    # # encoder
    # encoder_inputs = Input(shape=(src_max_len,))
    # # encoder embedding layer
    # enc_emb = Embedding(src_vocab, embedding_dim)(encoder_inputs)
    # # encoder LSTM 1
    # encoder_lstm1 = LSTM(hidden_size, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
    # encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)