def set_values_to_db(self):
    """Load every tab-separated QA file returned by get_json_files and
    upsert each row (article/question/answer + difficulty ratings) into
    the database."""
    for json_file in self.get_json_files():
        print_green(json_file)
        dataset = pd.read_csv(json_file, delimiter='\t', encoding='iso-8859-1')
        for index, data in dataset.iterrows():
            print_blue(data)
            article_title = data['ArticleTitle']
            question = data['Question']
            answer = data['Answer']
            difficulty_from_questioner = data['DifficultyFromQuestioner']
            difficulty_from_answerer = data['DifficultyFromAnswerer']
            article_file = data['ArticleFile']
            print_green(
                f"article_title => {article_title}, question => {question}, answer => {answer}, difficulty_from_questioner => {difficulty_from_questioner}, difficulty_from_answerer => {difficulty_from_answerer}, article_file => {article_file}"
            )
            # Upsert: update the existing row when found, insert otherwise.
            if self.__find_message(article_title, question, answer):
                self.__update_message(article_title, question, answer,
                                      difficulty_from_questioner,
                                      difficulty_from_answerer, article_file)
            else:
                self.__set_message(article_title, question, answer,
                                   difficulty_from_questioner,
                                   difficulty_from_answerer, article_file)
def set_values_to_db(self):
    """Load every tab-separated translation file and upsert each
    (english, translation) pair into the database.

    Each file carries two tab-separated columns: the English text
    (column 0) and its translation (column 1). The target language is
    derived from the file name via ``__find_language``.
    """
    json_files = self.get_json_files()
    for json_file in json_files:
        print_green(json_file)
        translate_language = self.__find_language(json_file)
        print_cyan(f"translate language => {translate_language}")
        # Probe the widest row so pandas gets enough column names.
        # BUGFIX: count columns with the same separator that read_csv
        # uses ("\t"); the original split on "," and therefore computed
        # a wrong column count for tab-separated data.
        with open(json_file, 'r') as temp_f:
            col_count = [len(li.split("\t")) for li in temp_f.readlines()]
        column_names = [i for i in range(0, max(col_count))]
        dataset = pd.read_csv(json_file, header=None, delimiter="\t",
                              names=column_names, error_bad_lines=False)
        self.dataset = dataset
        for index, data in dataset.iterrows():
            print_yellow(f"index => {index}")
            non_translate = None
            for i, d in enumerate(data):
                if not pd.isnull(d):
                    if i == 0:
                        # First column: source (English) text.
                        non_translate = d
                    elif i == 1:
                        # Second column: the translation; upsert the pair.
                        translate = d
                        translate_id = self.__find_translation(
                            translate, non_translate)
                        print_gray(f"translate_id => {translate_id}")
                        if translate_id:
                            self.__update_translation(
                                translate, non_translate, "en",
                                translate_language, translate_id)
                        else:
                            self.__set_translation(translate, non_translate,
                                                   "en", translate_language)
def set_values_to_db(self):
    """Read every comma-separated synonym file and store each
    (parent_verb, verb) pair in the database."""
    json_files = self.get_json_files()
    print_green(json_files)
    for json_file in json_files:
        print_green(json_file)
        # The widest row determines how many column names pandas needs.
        with open(json_file, 'r') as temp_f:
            widths = [len(line.split(",")) for line in temp_f.readlines()]
        column_names = list(range(max(widths)))
        dataset = pd.read_csv(json_file, header=None, delimiter=",",
                              names=column_names, error_bad_lines=False)
        self.dataset = dataset
        for index, data in dataset.iterrows():
            print_yellow(f"index => {index}")
            parent_verb = None
            for position, cell in enumerate(data):
                if pd.isnull(cell):
                    continue
                if position == 0:
                    # First column is the canonical verb for this row.
                    parent_verb = cell.strip()
                else:
                    # Remaining columns are its synonyms.
                    verb = cell.strip()
                    print_magenta(f"parent_verb => {parent_verb}")
                    print_blue(f"verb => {verb}")
                    synonym_id = self.__find_synonym(verb)
                    print_gray(f"synonym_id => {synonym_id}")
                    self.__set_synonyms(synonym_id, verb, parent_verb)
def __decompress_bz2_file(self, file, with_extension):
    """Decompress a .bz2 archive into ``self.destination_folder``.

    :param file: path of the packed file (needs a ``.name`` attribute,
        e.g. a ``pathlib.Path`` — TODO confirm against callers)
    :param with_extension: suffix appended to the unpacked file name
    """
    file_name = f"{self.destination_folder}/{file.name}{with_extension}"
    # BUGFIX: use context managers — the original leaked both the
    # BZ2File handle and the output file object (neither was closed).
    with BZ2File(file) as unpacked_file:
        data = unpacked_file.read()
    with open(file_name, 'wb') as destination:
        destination.write(data)
    print_green(f"unpacked bz2 file completed to {file_name}")
def __decompress_xz_file(self, file, with_extension):
    """Decompress a .xz archive into ``self.destination_folder``.

    :param file: path of the packed file (needs a ``.name`` attribute,
        e.g. a ``pathlib.Path`` — TODO confirm against callers)
    :param with_extension: suffix appended to the unpacked file name
    """
    file_name = f"{self.destination_folder}/{file.name}{with_extension}"
    # copyfileobj streams in chunks, so large archives are never fully
    # held in memory. (Removed the unused `filename` local.)
    with lzma.open(file) as compressed:
        with open(file_name, 'wb') as destination:
            shutil.copyfileobj(compressed, destination)
    print_green(f"unpacked xz file completed to {file_name}")
def __decompress_zst_file(self, file, with_extension):
    """Decompress a .zst (Zstandard) archive into ``self.destination_folder``.

    :param file: path of the packed file (needs a ``.name`` attribute,
        e.g. a ``pathlib.Path`` — TODO confirm against callers)
    :param with_extension: suffix appended to the unpacked file name
    """
    file_name = f"{self.destination_folder}/{file.name}{with_extension}"
    # copy_stream decompresses in chunks between the two file objects.
    # (Removed the unused `filename` local.)
    with open(file, 'rb') as compressed:
        decomp = ZstdDecompressor()
        with open(file_name, 'wb') as destination:
            decomp.copy_stream(compressed, destination)
    print_green(f"unpacked zst file completed to {file_name}")
def set_values_to_db(self):
    """Load the exported iMessage rows and upsert each one into the
    database, keyed by the message ``guid``."""
    rows = self.load_imessages_from_file().values.tolist()
    for index, row in enumerate(rows):
        print_green(row)
        # One tuple unpack replaces 29 positional index assignments;
        # the slice keeps behavior identical if a row carries extras.
        (ROWID, guid, text, handle_id, service, account, date, date_read,
         date_delivered, is_delivered, is_finished, is_emote, is_from_me,
         is_empty, is_delayed, is_auto_reply, is_prepared, is_read,
         is_system_message, is_sent, has_dd_results, cache_has_attachments,
         item_type, group_title, is_expirable, message_source, ck_record_id,
         destination_caller, is_spam) = row[:29]
        if not self.__find_message(guid):
            self.__set_message(ROWID, guid, text, handle_id, service, account,
                               date, date_read, date_delivered, is_delivered,
                               is_finished, is_emote, is_from_me, is_empty,
                               is_delayed, is_auto_reply, is_prepared, is_read,
                               is_system_message, is_sent, has_dd_results,
                               cache_has_attachments, item_type, group_title,
                               is_expirable, message_source, ck_record_id,
                               destination_caller, is_spam)
        else:
            self.__update_message(ROWID, guid, text, handle_id, service,
                                  account, date, date_read, date_delivered,
                                  is_delivered, is_finished, is_emote,
                                  is_from_me, is_empty, is_delayed,
                                  is_auto_reply, is_prepared, is_read,
                                  is_system_message, is_sent, has_dd_results,
                                  cache_has_attachments, item_type,
                                  group_title, is_expirable, message_source,
                                  ck_record_id, destination_caller, is_spam)
def download_dataset(self, url=dataset_source_url):
    """Download every dataset file listed at *url* into ``data/packed/``,
    skipping any file that is already present on disk."""
    for dataset in self.__get_dataset(url):
        target = Path(f"data/packed/{dataset}")
        try:
            # Guard clause: never re-download an existing file.
            if target.exists():
                print_red(f"{dataset} already exists!")
                continue
            print_blue(f"downloading file {dataset}")
            urllib2.urlretrieve(f"{url}{dataset}",
                                f"data/packed/{dataset}",
                                reporthook=report_hook)
            print_green(f"{dataset} file downloaded successfully")
        except Exception as e:
            print_red(
                f"cannot download data from url {url}{dataset}, {str(e)}")
def set_values_to_db(self):
    """Walk every Facebook-export JSON file and upsert each message,
    building a synthetic guid from index, sender, and timestamp."""
    for json_file in self.get_json_files():
        print_green(json_file)
        with open(json_file, buffering=1000, encoding='iso-8859-1') as f:
            j = json.load(f)
        print_blue(j)
        participants = j["participants"]
        # Messages are exported newest-first; iterate oldest-first.
        for index, message in enumerate(reversed(j["messages"])):
            sender_name = self.convert_encoding(message["sender_name"])
            timestamp_ms = message["timestamp_ms"]
            message_type = self.convert_encoding(message["type"])
            sender_message_name = self.convert_encoding(
                participants[0]['name'])
            if sender_message_name != "Konrad Uciechowski":
                guid = f"{index}_{sender_message_name}_{timestamp_ms}"
            else:
                guid = f"{index}_facebook_user_{timestamp_ms}"
            print_cyan(guid)
            try:
                content = self.convert_encoding(message["content"])
            except Exception as e:
                print_yellow(f"cannot get content, {e}")
                content = ""
            try:
                photo_str = ', '.join(
                    str(photo["uri"]) for photo in message["photos"])
            except Exception as e:
                print_yellow(f"cannot get photos, {e}")
                photo_str = ""
            if self.__find_message(guid):
                self.__update_message(guid, sender_name, timestamp_ms,
                                      message_type, content, photo_str)
            else:
                self.__set_message(guid, sender_name, timestamp_ms,
                                   message_type, content, photo_str)
def set_values_to_db(self):
    """Walk every Google Hangouts export file and upsert each chat
    event into the database, keyed by ``event_id``."""
    json_files = self.get_json_files()
    for json_file in json_files:
        print_green(json_file)
        with open(json_file, buffering=1000, encoding='iso-8859-1') as f:
            j = json.load(f)
            print_blue(j)
            conversations = j["conversations"]
            for conversation in conversations:
                participant_data = conversation["conversation"][
                    "conversation"]["participant_data"]
                participants = self.get_participants(participant_data)
                print_magenta(participants)
                events = conversation["events"]
                for event in events:
                    conversation_id = event["conversation_id"]["id"]
                    sender_id = event["sender_id"]["gaia_id"]
                    fallback_name = participants[sender_id]
                    timestamp = event["timestamp"]
                    event_id = event["event_id"]
                    # BUGFIX: reset per event. Previously a failed segment
                    # lookup left event_type/text unbound on the first
                    # event (NameError) or stale from the previous event.
                    event_type = None
                    text = None
                    try:
                        segment = event["chat_message"]["message_content"][
                            "segment"][0]
                        event_type = segment["type"]
                        text = self.convert_encoding(segment["text"])
                    except Exception as e:
                        print_yellow(
                            f"cannot find segment in {event}, {e}")
                    print_green(
                        f"conversation_id => {conversation_id}, sender_id => {sender_id}, fallback_name => {fallback_name}, timesttamp => {timestamp}, event_id => {event_id}, event_type => {event_type}, text => {text}"
                    )
                    message = self.__find_message(event_id)
                    if message:
                        self.__update_message(event_id, text, sender_id,
                                              fallback_name, timestamp,
                                              conversation_id, event_type)
                    else:
                        self.__set_message(event_id, text, sender_id,
                                           fallback_name, timestamp,
                                           conversation_id, event_type)
def set_values_to_db(self):
    """Load every tab-separated WikiQA file and upsert each
    question/sentence row, keyed by ``document_id``."""
    for json_file in self.get_json_files():
        print_green(json_file)
        dataset = pd.read_csv(json_file, delimiter='\t')
        for index, data in dataset.iterrows():
            print_blue(data)
            question_id = data["QuestionID"]
            question = data["Question"]
            document_id = data["DocumentID"]
            document_title = data["DocumentTitle"]
            sentence_id = data["SentenceID"]
            sentence = data["Sentence"]
            label = data["Label"]
            print_green(
                f"question_id => {question_id}, question => {question}, document_id => {document_id}, document_title => {document_title}, sentence_id => {sentence_id}, sentence => {sentence}, label => {label}")
            # Both branches share the same positional arguments.
            args = (question_id, question, document_id, document_title,
                    sentence_id, sentence, label)
            if self.__find_message(document_id):
                self.__update_message(*args)
            else:
                self.__set_message(*args)
def set_values_to_db(self):
    """Parse every downloaded Reddit comment dump (*.json files in the
    destination folder, one JSON object per line) and load scored
    comment/parent pairs into the database.

    A comment is only stored when its score is >= 2; it replaces an
    already-stored reply to the same parent only when its score is
    strictly higher and the body passes ``__acceptable``.
    """
    row_counter = 0   # total comment lines read across all files
    paired_rows = 0   # comments successfully paired with parent text
    f_list = self._get_dir_files(self.destination_folder)
    dir_path = Path(__file__).parent.parent.parent.parent
    for f_name in f_list:
        if f_name.suffix == '.json':
            file = f"{dir_path}/{f_name}"
            print_blue(file)
            with open(file, buffering=1000) as f:
                # Each line of the dump is one standalone JSON object.
                for data in f:
                    element = json.loads(data)
                    row_counter += 1
                    print_blue(element)
                    parent_id = element['parent_id']
                    body = self.__format_data(element['body'])
                    created_utc = element['created_utc']
                    score = element['score']
                    # Some dumps expose the id as 'name', others as 'id'.
                    # NOTE(review): the second lookup runs unconditionally,
                    # so 'id' wins whenever both keys exist — confirm that
                    # is intended.
                    try:
                        comment_id = element['name']
                    except Exception as e:
                        print_yellow(
                            f"comment id by name do not exists, {e}")
                    try:
                        comment_id = element['id']
                    except Exception as e:
                        print_yellow(
                            f"comment id by id do not exists, {e}")
                    subreddit = element['subreddit']
                    # Text of the parent comment, if it was seen earlier.
                    parent_data = self.__find_parent(parent_id)
                    print_green(
                        f"parent_id => {parent_id}, body => {body}, created_utc => {created_utc}, comment_id => {comment_id}, subreddit => {subreddit}, parent_data => {parent_data}"
                    )
                    if score >= 2:
                        comment_score = self.__find_score(parent_id)
                        if comment_score:
                            # A reply to this parent already exists; only
                            # replace it with a strictly higher-scored,
                            # acceptable comment.
                            if score > comment_score:
                                if self.__acceptable(body):
                                    self.insert_or_replace_comment(
                                        comment_id, parent_id, parent_data,
                                        body, subreddit, created_utc, score)
                        else:
                            if self.__acceptable(body):
                                if parent_data:
                                    # Parent text known: store a paired row.
                                    self.insert_parent(
                                        True, comment_id, parent_id,
                                        parent_data, body, subreddit,
                                        created_utc, score)
                                    paired_rows += 1
                                else:
                                    # Parent unknown: store unpaired.
                                    self.insert_parent(
                                        False, comment_id, parent_id, None,
                                        body, subreddit, created_utc, score)
                    # Periodic progress / cleanup bookkeeping per line read.
                    self.display_rows(row_counter, data, paired_rows)
                    self.clean_rows(row_counter, data)
        else:
            print_red(f"file of name {f_name} is not a json file")