def __update_synonym(self, verb, parent_verb, synonym_id):
    """Overwrite the ``synonyms_dictionary`` row identified by ``synonym_id``.

    The statement is assembled by plain string interpolation (no quote
    escaping is applied to ``verb`` / ``parent_verb``), echoed through
    ``print_magenta`` and passed to ``self.transaction_bldr``.

    Any exception is reported with ``print_red``; afterwards the cursor and
    connection are re-acquired from ``self.get_cursor()``.
    """
    try:
        sql = (
            "UPDATE synonyms_dictionary "
            f"SET synonym = '{verb}', synonym_parent = '{parent_verb}' "
            f"WHERE synonym_id = {synonym_id};"
        )
        print_magenta(f"update => {sql}")
        self.transaction_bldr(sql)
    except Exception as err:
        print_red(f"cannot update message on id {synonym_id}, {err}")
        self.cursor, self.connection = self.get_cursor()
def __update_message(self, event_id, text, sender_id, fallback_name,
                     timestamp, conversation_id, event_type):
    """Rewrite the ``hangouts_chat`` row whose ``event_id`` matches.

    Single quotes in ``text`` are turned into double quotes so they do not
    terminate the single-quoted SQL literal. The finished statement is
    echoed with ``print_magenta`` and passed to ``self.transaction_bldr``.

    Any exception is reported with ``print_red`` and the cursor/connection
    pair is re-acquired via ``self.get_cursor()``.
    """
    try:
        text = text.replace("'", '"')
        assignments = ", ".join((
            f"event_id = '{event_id}'",
            f"text = '{text}'",
            f"sender_id = {sender_id}",
            f"fallback_name = '{fallback_name}'",
            f"timestamp = '{timestamp}'",
            f"conversation_id = '{conversation_id}'",
            f"type = {event_type}",
        ))
        sql = f"UPDATE hangouts_chat SET {assignments} WHERE event_id = '{event_id}';"
        print_magenta(f"update => {sql}")
        self.transaction_bldr(sql)
    except Exception as err:
        print_red(f"cannot update message on id {event_id}, {err}")
        self.cursor, self.connection = self.get_cursor()
def __update_message(self, guid, sender_name, timestamp_ms, message_type,
                     content, photos_list):
    """Rewrite the ``facebook_chat`` row whose ``guid`` matches.

    Single quotes in ``content`` are replaced by double quotes before the
    value is embedded in the single-quoted SQL literal. The statement is
    echoed with ``print_magenta`` and passed to ``self.transaction_bldr``.

    Any exception is reported with ``print_red`` and the cursor/connection
    pair is re-acquired via ``self.get_cursor()``.
    """
    try:
        content = content.replace("'", '"')
        assignments = ", ".join((
            f"guid = '{guid}'",
            f"sender_name = '{sender_name}'",
            f"timestamp_ms = {timestamp_ms}",
            f"type = '{message_type}'",
            f"text = '{content}'",
            f"photos_list = '{photos_list}'",
        ))
        sql = f"UPDATE facebook_chat SET {assignments} WHERE guid = '{guid}';"
        print_magenta(f"update => {sql}")
        self.transaction_bldr(sql)
    except Exception as err:
        print_red(f"cannot update message on id {guid}, {err}")
        self.cursor, self.connection = self.get_cursor()
def __update_translation(self, translate, non_translate, language,
                         translate_language, translate_id):
    """Rewrite the ``translator`` row identified by ``translate_id``.

    Single quotes in ``translate`` and ``non_translate`` are replaced by
    double quotes so they fit inside single-quoted SQL literals. The
    statement is echoed with ``print_magenta`` and passed to
    ``self.transaction_bldr``.

    Any exception is reported with ``print_red`` and the cursor/connection
    pair is re-acquired via ``self.get_cursor()``.
    """
    try:
        translate = translate.replace("'", '"')
        non_translate = non_translate.replace("'", '"')
        sql = (
            "UPDATE translator SET translate = '{src}', "
            "non_translate = '{dst}', language = '{lang}', "
            "translate_language = '{target_lang}' "
            "WHERE translate_id = {row_id};"
        ).format(
            src=translate,
            dst=non_translate,
            lang=language,
            target_lang=translate_language,
            row_id=translate_id,
        )
        print_magenta(f"update => {sql}")
        self.transaction_bldr(sql)
    except Exception as err:
        print_red(f"cannot update message on id {translate_id}, {err}")
        self.cursor, self.connection = self.get_cursor()
def decompress_folder(self, compress_folder):
    """Decompress every file in ``compress_folder`` into one DataFrame.

    Each path returned by ``self._get_dir_files(compress_folder)`` is
    decompressed with ``self.decompress_file(file, ".json")`` and converted
    with ``self.json_to_csv(..., True)``; the DataFrame half of that result
    is collected.

    Args:
        compress_folder: Folder handed to ``self._get_dir_files``.

    Returns:
        A DataFrame with the rows of every successfully processed file
        stacked in file order under a fresh 0..n-1 index, or an empty
        DataFrame when nothing could be processed.

    A file that fails is reported with ``print_red`` and skipped.
    """
    frames = []
    for file in self._get_dir_files(compress_folder):
        try:
            j = self.decompress_file(file, ".json")
            _csv, part_dataset = self.json_to_csv(j, True)
        except Exception as e:
            print_red(f"cannot decompress file {file}, {e}")
        else:
            frames.append(part_dataset)

    if not frames:
        return pd.DataFrame()
    # DataFrame.join returns a new frame (and index-joins against an empty
    # frame anyway), so the previous code always returned an empty dataset.
    # Stack the per-file frames instead.
    return pd.concat(frames, ignore_index=True)
def __set_synonym(self, verb, parent_verb):
    """Insert a ``(synonym, synonym_parent)`` row into ``synonyms_dictionary``.

    Both values are embedded verbatim inside double-quoted SQL literals (no
    escaping is applied). The statement is echoed with ``print_cyan`` and
    passed to ``self.transaction_bldr``.

    Any exception is reported with ``print_red`` and the cursor/connection
    pair is re-acquired via ``self.get_cursor()``.
    """
    try:
        sql = (
            "INSERT INTO synonyms_dictionary(synonym, synonym_parent) "
            f'VALUES ("{verb}","{parent_verb}")'
        )
        print_cyan(f"set => {sql}")
        self.transaction_bldr(sql)
    except Exception as err:
        print_red(f"cannot update message on id {verb}, {err}")
        self.cursor, self.connection = self.get_cursor()
def insert_or_replace_comment(self, comment_id, parent_id, parent, comment,
                              subreddit, time, score):
    """Overwrite the ``reddit_comments`` row stored for ``parent_id``.

    Args:
        comment_id: Id of the comment that replaces the stored one.
        parent_id: Id of the parent; also used as the WHERE key.
        parent: Text of the parent comment.
        comment: Text of the replacing comment.
        subreddit: Subreddit value to store.
        time: Creation time; converted with ``int()`` and stored as ``unix``.
        score: Score of the replacing comment.

    The statement is echoed with ``print_magenta`` and passed to
    ``self.transaction_bldr``. Any exception is reported with ``print_red``
    and the cursor/connection pair is re-acquired via ``self.get_cursor()``.
    """
    try:
        # The template previously used `?` placeholders together with
        # str.format(), which only fills `{}` fields: the statement reached
        # the database with literal question marks and no values. Use the
        # same double-quoted `{}` fields as insert_parent so the values are
        # actually embedded.
        query = (
            'UPDATE reddit_comments SET parent_id = "{}", comment_id = "{}", '
            'parent = "{}", comment = "{}", subreddit = "{}", unix = "{}", '
            'score = "{}" WHERE parent_id = "{}";'
        ).format(
            parent_id, comment_id, parent, comment, subreddit, int(time),
            score, parent_id)
        print_magenta(f"update => {query}")
        self.transaction_bldr(query)
    except Exception as e:
        print_red(f"cannot update comment on id {comment_id}, {str(e)}")
        self.cursor, self.connection = self.get_cursor()
def __set_translation(self, translate, non_translate, language,
                      translate_language):
    """Insert one row into the ``translator`` table.

    Double quotes in ``translate`` and ``non_translate`` are replaced by
    single quotes so they fit inside double-quoted SQL literals. The
    statement is echoed with ``print_cyan`` and passed to
    ``self.transaction_bldr``.

    Any exception is reported with ``print_red`` and the cursor/connection
    pair is re-acquired via ``self.get_cursor()``.
    """
    try:
        translate = translate.replace('"', "'")
        non_translate = non_translate.replace('"', "'")
        row = ",".join(
            f'"{value}"'
            for value in (translate, non_translate, language, translate_language)
        )
        sql = (
            "INSERT INTO translator(translate, non_translate, language, "
            f"translate_language) VALUES ({row})"
        )
        print_cyan(f"set => {sql}")
        self.transaction_bldr(sql)
    except Exception as err:
        print_red(f"cannot update message on id {translate}, {err}")
        self.cursor, self.connection = self.get_cursor()
def __set_message(self, guid, sender_name, timestamp_ms, message_type,
                  content, photos_list):
    """Insert one row into ``facebook_chat``.

    Values are written positionally in the order guid, content,
    sender_name, timestamp_ms, message_type, photos_list. Double quotes in
    ``content`` are replaced by single quotes first. The statement is echoed
    with ``print_cyan`` and passed to ``self.transaction_bldr``.

    Any exception is reported with ``print_red`` and the cursor/connection
    pair is re-acquired via ``self.get_cursor()``.
    """
    try:
        content = content.replace('"', "'")
        sql = (
            "INSERT INTO facebook_chat VALUES ("
            f'"{guid}","{content}","{sender_name}",'
            f'"{timestamp_ms}","{message_type}","{photos_list}")'
        )
        print_cyan(f"set => {sql}")
        self.transaction_bldr(sql)
    except Exception as err:
        print_red(f"cannot update message on id {guid}, {err}")
        self.cursor, self.connection = self.get_cursor()
def __set_message(self, event_id, text, sender_id, fallback_name, timestamp,
                  conversation_id, event_type):
    """Insert one row into ``hangouts_chat``.

    Values are written positionally in the order event_id, text, sender_id,
    fallback_name, timestamp, conversation_id, event_type; ``sender_id`` and
    ``timestamp`` are converted with ``int()``. Double quotes in ``text``
    are replaced by single quotes first. The statement is echoed with
    ``print_cyan`` and passed to ``self.transaction_bldr``.

    Any exception (including a failed ``int()`` conversion) is reported with
    ``print_red`` and the cursor/connection pair is re-acquired via
    ``self.get_cursor()``.
    """
    try:
        text = text.replace('"', "'")
        values = (
            event_id,
            text,
            int(sender_id),
            fallback_name,
            int(timestamp),
            conversation_id,
            event_type,
        )
        row = ",".join('"{}"'.format(value) for value in values)
        sql = f"INSERT INTO hangouts_chat VALUES ({row})"
        print_cyan(f"set => {sql}")
        self.transaction_bldr(sql)
    except Exception as err:
        print_red(f"cannot update message on id {event_id}, {err}")
        self.cursor, self.connection = self.get_cursor()
def __update_message(self, question_id, question, document_id, document_title,
                     sentence_id, sentence, label):
    """Update the matching row of ``wiki_questions_and_answers``.

    The row is selected by ``document_id``, ``question`` and ``sentence``
    and all seven columns are rewritten with the given values.

    Single quotes in ``question`` and ``sentence`` (when they are strings)
    are replaced by double quotes so they cannot terminate the
    single-quoted SQL literals. The statement is echoed with
    ``print_magenta`` and passed to ``self.transaction_bldr``.

    Any exception is reported with ``print_red`` and the cursor/connection
    pair is re-acquired via ``self.get_cursor()``.
    """
    try:
        if isinstance(question, str):
            question = question.replace("'", '"')
        if isinstance(sentence, str):
            # Previously replace('"', '"'), a no-op that left apostrophes
            # in place and broke the quoted literal.
            sentence = sentence.replace("'", '"')
        # The WHERE clause used to reference an undefined name `answer`
        # (and a column this method never sets), so every call raised
        # NameError and no UPDATE was issued. `sentence` is used instead,
        # mirroring the (title, question, answer) key of the
        # questions_and_answers variant.
        query = (
            "UPDATE wiki_questions_and_answers SET "
            f"question_id = '{question_id}', question = '{question}', "
            f"document_id = '{document_id}', "
            f"document_title = '{document_title}', "
            f"sentence_id = '{sentence_id}', sentence = '{sentence}', "
            f"label = '{label}' "
            f"WHERE document_id = '{document_id}' "
            f"AND question = '{question}' AND sentence = '{sentence}';"
        )
        print_magenta(f"update => {query}")
        self.transaction_bldr(query)
    except Exception as e:
        print_red(f"cannot update message on id {document_id}, {str(e)}")
        self.cursor, self.connection = self.get_cursor()
def __update_message(self, article_title, question, answer,
                     difficulty_from_questioner, difficulty_from_answerer,
                     article_file):
    """Update the matching row of ``questions_and_answers``.

    The row is selected by ``article_title``, ``question`` and ``answer``
    and all six columns are rewritten with the given values.

    Single quotes in ``question`` and ``answer`` (when they are strings) are
    replaced by double quotes so they cannot terminate the single-quoted
    SQL literals. The statement is echoed with ``print_magenta`` and passed
    to ``self.transaction_bldr``.

    Any exception is reported with ``print_red`` and the cursor/connection
    pair is re-acquired via ``self.get_cursor()``.
    """
    try:
        if isinstance(question, str):
            question = question.replace("'", '"')
        if isinstance(answer, str):
            # Previously replace('"', '"'), a no-op: an apostrophe in the
            # answer ended the SQL literal early and the statement failed.
            answer = answer.replace("'", '"')
        query = (
            "UPDATE questions_and_answers SET "
            f"article_title = '{article_title}', question = '{question}', "
            f"answer = '{answer}', "
            f"difficulty_from_questioner = '{difficulty_from_questioner}', "
            f"difficulty_from_answerer = '{difficulty_from_answerer}', "
            f"article_file = '{article_file}' "
            f"WHERE article_title = '{article_title}' "
            f"AND question = '{question}' AND answer = '{answer}';"
        )
        print_magenta(f"update => {query}")
        self.transaction_bldr(query)
    except Exception as e:
        print_red(f"cannot update message on id {article_title}, {str(e)}")
        self.cursor, self.connection = self.get_cursor()
def __set_message(self, question_id, question, document_id, document_title,
                  sentence_id, sentence, label):
    """Insert one row into ``wiki_questions_and_answers``.

    Values are written positionally in parameter order. When ``question``
    or ``sentence`` is exactly a ``str``, its double quotes are replaced by
    single quotes so it fits inside a double-quoted SQL literal. The
    statement is echoed with ``print_cyan`` and passed to
    ``self.transaction_bldr``.

    Any exception is reported with ``print_red`` and the cursor/connection
    pair is re-acquired via ``self.get_cursor()``.
    """
    try:
        if type(question) is str:
            question = question.replace('"', "'")
        if type(sentence) is str:
            sentence = sentence.replace('"', "'")
        row = ",".join(
            f'"{value}"'
            for value in (question_id, question, document_id, document_title,
                          sentence_id, sentence, label)
        )
        sql = f"INSERT INTO wiki_questions_and_answers VALUES ({row})"
        print_cyan(f"set => {sql}")
        self.transaction_bldr(sql)
    except Exception as err:
        print_red(f"cannot update message on id {document_id}, {err}")
        self.cursor, self.connection = self.get_cursor()
def __find_message(self, pid):
    """Look up the ``text`` of the ``imessage_chat`` row with guid ``pid``.

    Returns:
        The first column of the first matching row, or ``False`` when no
        row matches or when the lookup raises.

    A cursor is acquired via ``self.get_cursor()`` when ``self.cursor`` is
    ``None``. On any exception the error is reported with ``print_red`` and
    the cursor/connection pair is re-acquired.
    """
    try:
        sql = f"SELECT text FROM imessage_chat WHERE guid = '{pid}' LIMIT 1"
        if self.cursor is None:
            self.cursor, self.connection = self.get_cursor()
        self.cursor.execute(sql)
        row = self.cursor.fetchone()
        return False if row is None else row[0]
    except Exception as err:
        print_red(f"cannot find message {err}")
        self.cursor, self.connection = self.get_cursor()
        return False
def __update_message(self, ROWID, guid, text, handle_id, service, account,
                     date, date_read, date_delivered, is_delivered,
                     is_finished, is_emote, is_from_me, is_empty, is_delayed,
                     is_auto_reply, is_prepared, is_read, is_system_message,
                     is_sent, has_dd_results, cache_has_attachments,
                     item_type, group_title, is_expirable, message_source,
                     ck_record_id, destination_caller, is_spam):
    """Rewrite the ``imessage_chat`` row whose ``guid`` matches.

    Single quotes in ``text`` are replaced by double quotes. Every numeric
    or flag column except ``ROWID`` is converted with ``int()``;
    ``destination_caller`` is stored in the ``destination_caller_id``
    column. The statement is echoed with ``print_magenta`` and passed to
    ``self.transaction_bldr``.

    Any exception (including a failed ``int()`` conversion) is reported with
    ``print_red`` and the cursor/connection pair is re-acquired via
    ``self.get_cursor()``.
    """
    try:
        text = text.replace("'", '"')
        # One "column = value" fragment per column, in statement order.
        assignments = [
            f"ROWID = {ROWID}",
            f"guid = '{guid}'",
            f"text = '{text}'",
            f"handle_id = {int(handle_id)}",
            f"`date` = {int(date)}",
            f"date_read = {int(date_read)}",
            f"date_delivered = {int(date_delivered)}",
            f"is_delivered = {int(is_delivered)}",
            f"is_finished = {int(is_finished)}",
            f"is_emote = {int(is_emote)}",
            f"is_from_me = {int(is_from_me)}",
            f"is_empty = {int(is_empty)}",
            f"is_delayed = {int(is_delayed)}",
            f"is_auto_reply = {int(is_auto_reply)}",
            f"is_prepared = {int(is_prepared)}",
            f"is_read = {int(is_read)}",
            f"is_system_message = {int(is_system_message)}",
            f"is_sent = {int(is_sent)}",
            f"has_dd_results = {int(has_dd_results)}",
            f"is_spam = {int(is_spam)}",
            f"cache_has_attachments = {int(cache_has_attachments)}",
            f"item_type = {int(item_type)}",
            f"group_title = '{group_title}'",
            f"is_expirable = {int(is_expirable)}",
            f"message_source = {int(message_source)}",
            f"destination_caller_id = '{destination_caller}'",
            f"ck_record_id = '{ck_record_id}'",
            f"account = '{account}'",
            f"service = '{service}'",
        ]
        sql = (
            "UPDATE imessage_chat SET "
            + ", ".join(assignments)
            + f" WHERE guid = '{guid}';"
        )
        print_magenta(f"update => {sql}")
        self.transaction_bldr(sql)
    except Exception as err:
        print_red(f"cannot update message on id {guid}, {err}")
        self.cursor, self.connection = self.get_cursor()
def __set_message(self, article_title, question, answer,
                  difficulty_from_questioner, difficulty_from_answerer,
                  article_file):
    """Insert one row into ``questions_and_answers``.

    Values are written positionally in parameter order. When ``question``
    or ``answer`` is exactly a ``str``, its double quotes are replaced by
    single quotes so it fits inside a double-quoted SQL literal. The
    statement is echoed with ``print_cyan`` and passed to
    ``self.transaction_bldr``.

    Any exception is reported with ``print_red`` and the cursor/connection
    pair is re-acquired via ``self.get_cursor()``.
    """
    try:
        if type(question) is str:
            question = question.replace('"', "'")
        if type(answer) is str:
            answer = answer.replace('"', "'")
        sql = (
            "INSERT INTO questions_and_answers VALUES ("
            f'"{article_title}","{question}","{answer}",'
            f'"{difficulty_from_questioner}","{difficulty_from_answerer}",'
            f'"{article_file}")'
        )
        print_cyan(f"set => {sql}")
        self.transaction_bldr(sql)
    except Exception as err:
        print_red(f"cannot update message on id {article_title}, {err}")
        self.cursor, self.connection = self.get_cursor()
def __find_message(self, article_name, question, answer):
    """Check ``questions_and_answers`` for an exact title/question/answer row.

    Returns:
        The first selected column (``answer``) of the first matching row,
        or ``False`` when no row matches or when the lookup raises.

    A cursor is acquired via ``self.get_cursor()`` when ``self.cursor`` is
    ``None``. On any exception the error is reported with ``print_red`` and
    the cursor/connection pair is re-acquired.
    """
    try:
        sql = (
            "SELECT answer, question FROM questions_and_answers "
            f"WHERE article_title = '{article_name}' "
            f"AND question = '{question}' AND answer = '{answer}' LIMIT 1"
        )
        if self.cursor is None:
            self.cursor, self.connection = self.get_cursor()
        self.cursor.execute(sql)
        row = self.cursor.fetchone()
        if row is None:
            return False
        return row[0]
    except Exception as err:
        print_red(f"cannot find message {err}")
        self.cursor, self.connection = self.get_cursor()
        return False
def download_dataset(self, url=dataset_source_url):
    """Download every dataset listed for ``url`` into ``data/packed/``.

    The names come from ``self.__get_dataset(url)``. A name whose target
    path ``data/packed/<name>`` already exists is reported and skipped;
    otherwise ``<url><name>`` is fetched with ``urllib2.urlretrieve`` using
    ``report_hook`` as the progress callback.

    Args:
        url: Base URL that each dataset name is appended to.

    A failure for one dataset is reported with ``print_red`` and does not
    stop the remaining downloads.
    """
    for name in self.__get_dataset(url):
        target = Path(f"data/packed/{name}")
        try:
            if target.exists():
                print_red(f"{name} already exists!")
                continue
            print_blue(f"downloading file {name}")
            urllib2.urlretrieve(
                f"{url}{name}",
                f"data/packed/{name}",
                reporthook=report_hook,
            )
            print_green(f"{name} file downloaded successfully")
        except Exception as err:
            print_red(f"cannot download data from url {url}{name}, {err}")
def __find_synonym(self, verb):
    """Look up the ``synonym_id`` stored for ``verb`` in ``synonyms_dictionary``.

    Returns:
        The first column of the first matching row, or ``False`` when no
        row matches or when the lookup raises.

    A cursor is acquired via ``self.get_cursor()`` when ``self.cursor`` is
    ``None``. On any exception the error is reported with ``print_red`` and
    the cursor/connection pair is re-acquired.
    """
    try:
        sql = (
            "SELECT synonym_id FROM synonyms_dictionary "
            f"WHERE synonym = '{verb}' LIMIT 1"
        )
        if self.cursor is None:
            self.cursor, self.connection = self.get_cursor()
        self.cursor.execute(sql)
        row = self.cursor.fetchone()
        return row[0] if row is not None else False
    except Exception as err:
        print_red(f"cannot find synonym {err}")
        self.cursor, self.connection = self.get_cursor()
        return False
def __find_message(self, document_id):
    """Look up a ``sentence`` stored for ``document_id``.

    Queries ``wiki_questions_and_answers`` for at most one row.

    Returns:
        The first column of the first matching row, or ``False`` when no
        row matches or when the lookup raises.

    A cursor is acquired via ``self.get_cursor()`` when ``self.cursor`` is
    ``None``. On any exception the error is reported with ``print_red`` and
    the cursor/connection pair is re-acquired.
    """
    try:
        sql = (
            "SELECT sentence FROM wiki_questions_and_answers "
            f"WHERE document_id = '{document_id}' LIMIT 1"
        )
        if self.cursor is None:
            self.cursor, self.connection = self.get_cursor()
        self.cursor.execute(sql)
        row = self.cursor.fetchone()
        if row is None:
            return False
        return row[0]
    except Exception as err:
        print_red(f"cannot find message {err}")
        self.cursor, self.connection = self.get_cursor()
        return False
def __find_parent(self, pid):
    """Fetch the ``comment`` text of the ``reddit_comments`` row with id ``pid``.

    The row is matched on its ``comment_id`` column.

    Returns:
        The first column of the first matching row, or ``False`` when no
        row matches or when the lookup raises.

    A cursor is acquired via ``self.get_cursor()`` when ``self.cursor`` is
    ``None``. On any exception the error is reported with ``print_red`` and
    the cursor/connection pair is re-acquired.
    """
    try:
        sql = f"SELECT comment FROM reddit_comments WHERE comment_id = '{pid}' LIMIT 1"
        if self.cursor is None:
            self.cursor, self.connection = self.get_cursor()
        self.cursor.execute(sql)
        row = self.cursor.fetchone()
        return False if row is None else row[0]
    except Exception as err:
        print_red(f"cannot find parent {err}")
        self.cursor, self.connection = self.get_cursor()
        return False
def insert_parent(self, has_parent, parent_id, comment_id, parent, comment,
                  subreddit, time, score):
    """Insert a row into ``reddit_comments``, with or without parent text.

    Args:
        has_parent: When truthy the ``parent`` column is included in the
            INSERT; otherwise it is left out of the column list.
        parent_id: Value for the ``parent_id`` column.
        comment_id: Value for the ``comment_id`` column.
        parent: Parent text (only used when ``has_parent`` is truthy).
        comment: Value for the ``comment`` column.
        subreddit: Value for the ``subreddit`` column.
        time: Converted with ``int()`` and stored as ``unix``.
        score: Value for the ``score`` column.

    The statement is echoed with ``print_cyan`` and passed to
    ``self.transaction_bldr``. Any exception is reported with ``print_red``
    and the cursor/connection pair is re-acquired via ``self.get_cursor()``.
    """
    try:
        columns = ["parent_id", "comment_id"]
        values = [parent_id, comment_id]
        if has_parent:
            columns.append("parent")
            values.append(parent)
        columns += ["comment", "subreddit", "unix", "score"]
        values += [comment, subreddit, int(time), score]

        column_list = ", ".join(columns)
        value_list = ", ".join(f'"{value}"' for value in values)
        sql = f"INSERT INTO reddit_comments ({column_list}) VALUES ({value_list})"
        print_cyan(f"insert => {sql}")
        self.transaction_bldr(sql)
    except Exception as err:
        print_red(f"cannot insert parent comment of id {comment_id}, {err}")
        self.cursor, self.connection = self.get_cursor()
def __find_translation(self, translate, non_translate):
    """Look up the ``translate_id`` of a translation pair in ``translator``.

    Single quotes in both arguments are replaced by double quotes before
    they are embedded in the single-quoted SQL literals.

    Returns:
        The first column of the first matching row, or ``False`` when no
        row matches or when the lookup raises.

    A cursor is acquired via ``self.get_cursor()`` when ``self.cursor`` is
    ``None``. On any exception the error is reported with ``print_red`` and
    the cursor/connection pair is re-acquired.
    """
    try:
        translate = translate.replace("'", '"')
        non_translate = non_translate.replace("'", '"')
        sql = (
            "SELECT translate_id FROM translator "
            f"WHERE translate = '{translate}' "
            f"AND non_translate = '{non_translate}' LIMIT 1"
        )
        if self.cursor is None:
            self.cursor, self.connection = self.get_cursor()
        self.cursor.execute(sql)
        row = self.cursor.fetchone()
        if row is None:
            return False
        return row[0]
    except Exception as err:
        print_red(f"cannot find synonym {err}")
        self.cursor, self.connection = self.get_cursor()
        return False
def __set_message(self, ROWID, guid, text, handle_id, service, account, date,
                  date_read, date_delivered, is_delivered, is_finished,
                  is_emote, is_from_me, is_empty, is_delayed, is_auto_reply,
                  is_prepared, is_read, is_system_message, is_sent,
                  has_dd_results, cache_has_attachments, item_type,
                  group_title, is_expirable, message_source, ck_record_id,
                  destination_caller, is_spam):
    """Insert one row into ``imessage_chat``.

    The 29 values are written positionally, starting with ``guid`` and
    ``text`` and ending with ``service``, ``account`` and ``ROWID`` (see the
    ``values`` tuple for the exact order). Double quotes in ``text`` are
    replaced by single quotes and every numeric or flag value except
    ``ROWID`` is converted with ``int()``. The statement is echoed with
    ``print_cyan`` and passed to ``self.transaction_bldr``.

    Any exception (including a failed ``int()`` conversion) is reported with
    ``print_red`` and the cursor/connection pair is re-acquired via
    ``self.get_cursor()``.
    """
    try:
        text = text.replace('"', "'")
        # 29 double-quoted slots: five rows of five plus one row of four.
        template = (
            'INSERT INTO imessage_chat VALUES ('
            '"{}","{}","{}","{}","{}",'
            '"{}","{}","{}","{}","{}",'
            '"{}","{}","{}","{}","{}",'
            '"{}","{}","{}","{}","{}",'
            '"{}","{}","{}","{}","{}",'
            '"{}","{}","{}","{}"'
            ')'
        )
        values = (
            guid,
            text,
            int(handle_id),
            int(date),
            int(date_read),
            int(date_delivered),
            int(is_delivered),
            int(is_finished),
            int(is_emote),
            int(is_from_me),
            int(is_empty),
            int(is_delayed),
            int(is_auto_reply),
            int(is_prepared),
            int(is_read),
            int(is_system_message),
            int(is_sent),
            int(has_dd_results),
            int(is_spam),
            int(cache_has_attachments),
            int(item_type),
            group_title,
            int(is_expirable),
            int(message_source),
            destination_caller,
            ck_record_id,
            service,
            account,
            ROWID,
        )
        sql = template.format(*values)
        print_cyan(f"set => {sql}")
        self.transaction_bldr(sql)
    except Exception as err:
        print_red(f"cannot update message on id {guid}, {err}")
        self.cursor, self.connection = self.get_cursor()
def set_values_to_db(self):
    """Load reddit comments from the JSON-lines files into the database.

    Every ``.json`` file returned by
    ``self._get_dir_files(self.destination_folder)`` is read line by line;
    each line is parsed as one JSON object. For a comment with a score of
    at least 2:

    * if ``self.__find_score(parent_id)`` returns a truthy score, the
      stored row is replaced only when the new score is higher and the
      body passes ``self.__acceptable``;
    * otherwise an acceptable body is inserted, with the parent text when
      ``self.__find_parent`` found one (counted as a paired row) and
      without it when not.

    ``self.display_rows`` and ``self.clean_rows`` are called after every
    row. Files without a ``.json`` suffix are reported and skipped.
    """
    row_counter = 0
    paired_rows = 0
    f_list = self._get_dir_files(self.destination_folder)
    dir_path = Path(__file__).parent.parent.parent.parent
    for f_name in f_list:
        if f_name.suffix != '.json':
            print_red(f"file of name {f_name} is not a json file")
            continue

        file = f"{dir_path}/{f_name}"
        print_blue(file)
        with open(file, buffering=1000) as f:
            for data in f:
                element = json.loads(data)
                row_counter += 1
                print_blue(element)
                parent_id = element['parent_id']
                body = self.__format_data(element['body'])
                created_utc = element['created_utc']
                score = element['score']

                # Prefer 'name', fall back to 'id'. When both are missing
                # the id is None so the row is skipped below, instead of
                # reusing the previous row's id (or raising NameError on
                # the very first row).
                try:
                    comment_id = element['name']
                except KeyError as e:
                    print_yellow(f"comment id by name do not exists, {e}")
                    try:
                        comment_id = element['id']
                    except KeyError as e:
                        print_yellow(f"comment id by id do not exists, {e}")
                        comment_id = None

                subreddit = element['subreddit']
                parent_data = self.__find_parent(parent_id)
                print_green(
                    f"parent_id => {parent_id}, body => {body}, created_utc => {created_utc}, comment_id => {comment_id}, subreddit => {subreddit}, parent_data => {parent_data}"
                )

                if comment_id is not None and score >= 2:
                    comment_score = self.__find_score(parent_id)
                    if comment_score:
                        if score > comment_score and self.__acceptable(body):
                            self.insert_or_replace_comment(
                                comment_id, parent_id, parent_data, body,
                                subreddit, created_utc, score)
                    elif self.__acceptable(body):
                        # Keyword arguments: insert_parent takes parent_id
                        # before comment_id, and the former positional calls
                        # passed them the other way round, storing the two
                        # ids in each other's column.
                        if parent_data:
                            self.insert_parent(
                                True,
                                parent_id=parent_id,
                                comment_id=comment_id,
                                parent=parent_data,
                                comment=body,
                                subreddit=subreddit,
                                time=created_utc,
                                score=score)
                            paired_rows += 1
                        else:
                            self.insert_parent(
                                False,
                                parent_id=parent_id,
                                comment_id=comment_id,
                                parent=None,
                                comment=body,
                                subreddit=subreddit,
                                time=created_utc,
                                score=score)

                self.display_rows(row_counter, data, paired_rows)
                self.clean_rows(row_counter, data)