import json
import os
import sys

# Preprocessor and IGNORE_TERMS_FILE_NAMES are defined elsewhere in this
# module; the snippet does not show where they come from.


def main():
    # Directory where this script is located
    dirname = os.path.dirname(__file__)

    # List of terms to be ignored by the tokenizer
    ignore_terms = []

    # Collect the terms we want to ignore
    for ignore_file_name in IGNORE_TERMS_FILE_NAMES:
        with open(os.path.join(dirname, ignore_file_name)) as file:
            ignore_terms.extend(term.strip() for term in file)
            
    # Create our custom tokenizer, passing it the terms we want to ignore
    preprocessor = Preprocessor(word_chars='a-zA-Z0-9', inter_chars="'",
                                min_length=3, ignore=ignore_terms)
    
    for line in sys.stdin:
        bug_report = json.loads(line)
        old_title = bug_report['title']
        old_description = bug_report['description']

        bug_report['title'] = ' '.join(preprocessor.preprocess(old_title))
        bug_report['description'] = ' '.join(
            preprocessor.preprocess(old_description))

        print(json.dumps(bug_report))


if __name__ == '__main__':
    main()
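A quick smoke test for the script above, assuming it is saved as preprocess.py (the snippet does not show the filename): pipe one JSON-encoded bug report per line into stdin and read the preprocessed record back.

# Hypothetical smoke test; "preprocess.py" is an assumed filename.
import json
import subprocess

report = {"title": "App crashes on startup", "description": "It's broken again"}
result = subprocess.run(
    ["python", "preprocess.py"],
    input=json.dumps(report) + "\n",
    capture_output=True,
    text=True,
)
print(json.loads(result.stdout))  # title/description now hold the joined tokens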
Example #2
File: debug.py Project: pavsenin/aiodds
import pandas as pd
import time
from datetime import datetime, timedelta

from utils import Log
from db import DBProvider
from preprocessing import Preprocessor
from config_provider import config

db, log = DBProvider(), Log(config, 'debug')
preprocessor = Preprocessor(db, log)
preprocessor.preprocess()
Example #3
def preprocess(self):
    # Run the shared Preprocessor over this instance's dataset and keep
    # the class labels and numerical features it extracts.
    p = Preprocessor(self.dataset)
    p.preprocess()
    self.ground_truth = p.get_classification_data().values
    self.numerical = p.get_numerical()
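For context, a minimal sketch of the kind of class this method appears to live in; only dataset, ground_truth, and numerical come from the snippet, and the class name and constructor below are assumptions for illustration.

# Hypothetical host class; the snippet only shows its preprocess() method.
class ExperimentData:
    def __init__(self, dataset):
        self.dataset = dataset      # raw records handed to Preprocessor
        self.ground_truth = None    # class labels, set by preprocess()
        self.numerical = None       # numerical features, set by preprocess()

    # preprocess() as defined in Example #3 above

Typical usage would then be: construct with the raw dataset, call preprocess(), and read ground_truth and numerical afterwards.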
Example #4
# Imports reconstructed from Example #2 (same project, pavsenin/aiodds);
# the updater/scraper classes used below are defined elsewhere in that
# project and are not shown in this snippet.
import time
from datetime import datetime, timedelta

from utils import Log
from db import DBProvider
from preprocessing import Preprocessor
from config_provider import config

db, log = DBProvider(), Log(config, 'update')

now = datetime.now()  # `now` was undefined in the snippet; current time assumed
now_morning = datetime(now.year, now.month, now.day, 4)
start_time = time.time()
log.debug(f"Clear RAM {config.OS.clear_ram()}")
log.debug(f'Current DB is {config.Database}')
log.debug(f"Update current_matches to {now_morning}")
updater = CurrentUpdater(LeagueScraper(),
                         MatchScraper(from_time=None, to_time=now_morning), db,
                         log)
updater.update()
log.debug(f"Updated current_matches for {int(time.time() - start_time)} sec")

next_day = now + timedelta(days=1)
next_day_morning = datetime(next_day.year, next_day.month, next_day.day, 4)
start_time = time.time()
log.debug(f"Update future_matches from {now_morning} to {next_day_morning}")
updater = FutureUpdater(
    FutureLeagueScraper(),
    MatchScraper(from_time=now_morning, to_time=next_day_morning), db, log)
updater.update()
log.debug(f"Updated future_matches for {int(time.time() - start_time)} sec")

start_time = time.time()
log.debug(f"Clear RAM {config.OS.clear_ram()}")
preprocessor = Preprocessor(db, log)
log.debug('Preprocess matches')
num_matches = preprocessor.preprocess()
log.debug(
    f'Preprocessed {num_matches} matches for {int(time.time() - start_time)} sec'
)
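An optional refactor sketch: the script above measures each phase with repeated start_time bookkeeping; a small context manager (a suggestion, not part of the project) expresses the same pattern once.

# Suggested helper, not from the original project: times a block and logs
# the elapsed seconds in the same format the script uses.
import time
from contextlib import contextmanager

@contextmanager
def timed(log, label):
    start = time.time()
    yield
    log.debug(f"{label} for {int(time.time() - start)} sec")

# Usage, e.g.:
#     with timed(log, "Updated current_matches"):
#         updater.update()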
Example #5
# from attention import AttentionLayer
import pandas as pd
import tensorflow as tf
# The original aliased `keras = tf.keras` but then imported from the
# standalone `keras` package; importing via tensorflow.keras is consistent.
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Preprocessor comes from elsewhere in this project (import not shown).

if __name__ == "__main__":
    path = "/Users/seungyoungoh/workspace/text_summarization_project/"
    # error_bad_lines was removed in pandas 2.0; use on_bad_lines="skip" there.
    data = pd.read_csv(path + "data/sample.csv", error_bad_lines=False)
    data = data.rename({
        'body': 'src',
        'key_point': 'smry'
    }, axis='columns')[['src', 'smry']]
    pr = Preprocessor(data)
    (src_max_len, smry_max_len, src_vocab, smry_vocab,
     X_train, X_test, y_train, y_test) = pr.preprocess()

    # ### modeling
    # embedding_dim = 128
    # hidden_size = 256

    # # Encoder
    # encoder_inputs = Input(shape=(src_max_len,))

    # # Encoder embedding layer
    # enc_emb = Embedding(src_vocab, embedding_dim)(encoder_inputs)

    # # Encoder LSTM 1
    # encoder_lstm1 = LSTM(hidden_size, return_sequences=True, return_state=True ,dropout = 0.4, recurrent_dropout = 0.4)
    # encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)
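The commented-out block stops at the first encoder LSTM. For orientation only, here is a generic single-layer seq2seq completion in tf.keras; it is a sketch under the assumption that pr.preprocess() returns the names used below, not the author's original architecture (which hints at stacked LSTMs and an AttentionLayer).

    # Generic seq2seq sketch, not the original model.
    embedding_dim, hidden_size = 128, 256

    encoder_inputs = Input(shape=(src_max_len,))
    enc_emb = Embedding(src_vocab, embedding_dim)(encoder_inputs)
    encoder_outputs, state_h, state_c = LSTM(
        hidden_size, return_sequences=True, return_state=True)(enc_emb)

    decoder_inputs = Input(shape=(None,))
    dec_emb = Embedding(smry_vocab, embedding_dim)(decoder_inputs)
    decoder_outputs, _, _ = LSTM(
        hidden_size, return_sequences=True, return_state=True)(
            dec_emb, initial_state=[state_h, state_c])
    outputs = Dense(smry_vocab, activation="softmax")(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], outputs)
    model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy")
    model.summary()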