def set_values_to_db(self):
    json_files = self.get_json_files()
    print_green(json_files)
    for json_file in json_files:
        print_green(json_file)
        # Pre-scan the raw file for the widest row so the ragged
        # CSV can be read with a fixed set of column names.
        with open(json_file, 'r') as temp_f:
            col_count = [len(li.split(",")) for li in temp_f.readlines()]
        column_names = [i for i in range(0, max(col_count))]
        # Note: error_bad_lines=False became on_bad_lines='skip' in pandas 1.3.
        dataset = pd.read_csv(json_file,
                              header=None,
                              delimiter=",",
                              names=column_names,
                              error_bad_lines=False)
        self.dataset = dataset
        for index, data in dataset.iterrows():
            print_yellow(f"index => {index}")
            parent_verb = None
            for i, d in enumerate(data):
                if not pd.isnull(d):
                    if i == 0:
                        # The first column holds the base verb; the rest
                        # of the row holds its synonyms.
                        parent_verb = d.strip()
                    else:
                        verb = d.strip()
                        print_magenta(f"parent_verb => {parent_verb}")
                        print_blue(f"verb => {verb}")
                        synonym_id = self.__find_synonym(verb)
                        print_gray(f"synonym_id => {synonym_id}")
                        self.__set_synonyms(synonym_id, verb, parent_verb)
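The pre-scan exists because pandas needs the full column set up front: without explicit names it sizes the frame from the first row and raises on any wider one. A minimal standalone sketch of the same trick, with an invented file name and contents:

import pandas as pd

# Write a small ragged file so the example is self-contained.
path = "ragged_example.csv"  # illustrative name, not from the project
with open(path, "w") as fh:
    fh.write("go,move,walk\nrun,sprint\nsee,look,watch,observe\n")

# Pass 1: find the widest row.
with open(path) as fh:
    widest = max(len(line.split(",")) for line in fh)

# Pass 2: naming every possible column lets read_csv pad short
# rows with NaN instead of raising on the long ones.
frame = pd.read_csv(path, header=None, names=range(widest))
print(frame)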
def set_values_to_db(self):
    json_files = self.get_json_files()
    for json_file in json_files:
        print_green(json_file)
        translate_language = self.__find_language(json_file)
        print_cyan(f"translate language => {translate_language}")
        # The rows are tab-separated, so size the widest row by tabs
        # to match the delimiter passed to read_csv below.
        with open(json_file, 'r') as temp_f:
            col_count = [len(li.split("\t")) for li in temp_f.readlines()]
        column_names = [i for i in range(0, max(col_count))]
        dataset = pd.read_csv(json_file,
                              header=None,
                              delimiter="\t",
                              names=column_names,
                              error_bad_lines=False)
        self.dataset = dataset
        for index, data in dataset.iterrows():
            print_yellow(f"index => {index}")
            non_translate = None
            for i, d in enumerate(data):
                if not pd.isnull(d):
                    if i == 0:
                        non_translate = d
                    elif i == 1:
                        translate = d
                        translate_id = self.__find_translation(
                            translate, non_translate)
                        print_gray(f"translate_id => {translate_id}")
                        # Upsert: update the pair when it is already
                        # known, insert it otherwise.
                        if translate_id:
                            self.__update_translation(
                                translate, non_translate, "en",
                                translate_language, translate_id)
                        else:
                            self.__set_translation(
                                translate, non_translate, "en",
                                translate_language)
def _load_json(self, file_name, encoding='utf-8', errors='ignore'):
    try:
        with open(file_name, encoding=encoding, errors=errors) as data_file:
            parser = JsonComment(json)
            data = parser.load(data_file)
        return data
    except Exception as e:
        print_yellow(f"cannot load json from {file_name}, {e}")
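JsonComment wraps the standard json module so that files containing comment lines or trailing commas still parse. A quick, self-contained illustration of the parser used in the loader above (the sample document is made up):

import json
from jsoncomment import JsonComment

parser = JsonComment(json)
raw = """
{
    # plain json.loads would reject this comment line
    "verb": "run",
    "lang": "en"
}
"""
data = parser.loads(raw)
print(data)  # {'verb': 'run', 'lang': 'en'}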
def transaction_bldr(self, query):
    self.sql_transaction.append(query)
    # Flush in bulk: one commit per thousand statements is far
    # cheaper than one commit per insert.
    if len(self.sql_transaction) > 1000:
        if self.cursor is None:
            self.cursor, self.connection = self.get_cursor()
        self.cursor.execute("BEGIN TRANSACTION")
        for s in self.sql_transaction:
            try:
                self.cursor.execute(s)
            except Exception as e:
                print_yellow(f"cannot execute query {s}, {str(e)}")
        self.connection.commit()
        self.sql_transaction = []
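A standalone sketch of the same batching pattern against sqlite3; the table and rows are invented for the example:

import sqlite3

connection = sqlite3.connect(":memory:")  # illustrative database
cursor = connection.cursor()
cursor.execute("CREATE TABLE IF NOT EXISTS messages (body TEXT)")

statements = [f"INSERT INTO messages VALUES ('row {i}')" for i in range(2500)]

# Commit once per batch instead of once per statement.
batch = []
for statement in statements:
    batch.append(statement)
    if len(batch) >= 1000:
        for s in batch:
            cursor.execute(s)
        connection.commit()
        batch = []
# Flush the remainder.
for s in batch:
    cursor.execute(s)
connection.commit()

One consequence of the helper's design is that a final batch smaller than the threshold is never flushed by transaction_bldr itself, so a closing commit elsewhere has to pick up the trailing statements.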
def set_values_to_db(self):
    json_files = self.get_json_files()
    for json_file in json_files:
        print_green(json_file)
        with open(json_file, buffering=1000, encoding='iso-8859-1') as f:
            j = json.load(f)
        print_blue(j)
        participants = j["participants"]
        for index, message in enumerate(reversed(j["messages"])):
            sender_name = self.convert_encoding(message["sender_name"])
            timestamp_ms = message["timestamp_ms"]
            message_type = self.convert_encoding(message["type"])
            sender_message_name = self.convert_encoding(
                participants[0]['name'])
            if sender_message_name != "Konrad Uciechowski":
                guid = f"{index}_{sender_message_name}_{timestamp_ms}"
            else:
                guid = f"{index}_facebook_user_{timestamp_ms}"
            print_cyan(guid)
            try:
                content = self.convert_encoding(message["content"])
            except Exception as e:
                print_yellow(f"cannot get content, {e}")
                content = ""
            try:
                photos = message["photos"]
                photos_list = list()
                for photo in photos:
                    photo_uri = photo["uri"]
                    photos_list.append(photo_uri)
                photo_str = ', '.join([str(elem) for elem in photos_list])
            except Exception as e:
                print_yellow(f"cannot get photos, {e}")
                photo_str = ""
            exists = self.__find_message(guid)
            if exists:
                self.__update_message(guid, sender_name, timestamp_ms,
                                      message_type, content, photo_str)
            else:
                self.__set_message(guid, sender_name, timestamp_ms,
                                   message_type, content, photo_str)
def set_values_to_db(self):
    json_files = self.get_json_files()
    for json_file in json_files:
        print_green(json_file)
        with open(json_file, buffering=1000, encoding='iso-8859-1') as f:
            j = json.load(f)
        print_blue(j)
        conversations = j["conversations"]
        for conversation in conversations:
            participant_data = conversation["conversation"][
                "conversation"]["participant_data"]
            participants = self.get_participants(participant_data)
            print_magenta(participants)
            events = conversation["events"]
            for event in events:
                conversation_id = event["conversation_id"]["id"]
                sender_id = event["sender_id"]["gaia_id"]
                fallback_name = participants[sender_id]
                timestamp = event["timestamp"]
                event_id = event["event_id"]
                # Defaults keep the log and the insert below from
                # reusing values of a previous event when this one
                # has no text segment.
                event_type = None
                text = None
                try:
                    segment = event["chat_message"]["message_content"][
                        "segment"][0]
                    event_type = segment["type"]
                    text = self.convert_encoding(segment["text"])
                except Exception as e:
                    print_yellow(f"cannot find segment in {event}, {e}")
                print_green(
                    f"conversation_id => {conversation_id}, "
                    f"sender_id => {sender_id}, "
                    f"fallback_name => {fallback_name}, "
                    f"timestamp => {timestamp}, "
                    f"event_id => {event_id}, "
                    f"event_type => {event_type}, text => {text}")
                message = self.__find_message(event_id)
                if message:
                    self.__update_message(event_id, text, sender_id,
                                          fallback_name, timestamp,
                                          conversation_id, event_type)
                else:
                    self.__set_message(event_id, text, sender_id,
                                       fallback_name, timestamp,
                                       conversation_id, event_type)
def set_values_to_db(self):
    row_counter = 0
    paired_rows = 0
    f_list = self._get_dir_files(self.destination_folder)
    dir_path = Path(__file__).parent.parent.parent.parent
    for f_name in f_list:
        if f_name.suffix == '.json':
            file = f"{dir_path}/{f_name}"
            print_blue(file)
            with open(file, buffering=1000) as f:
                # Each line of the dump is a separate JSON document.
                for data in f:
                    element = json.loads(data)
                    row_counter += 1
                    print_blue(element)
                    parent_id = element['parent_id']
                    body = self.__format_data(element['body'])
                    created_utc = element['created_utc']
                    score = element['score']
                    try:
                        comment_id = element['name']
                    except Exception as e:
                        print_yellow(
                            f"comment id by name does not exist, {e}")
                    try:
                        comment_id = element['id']
                    except Exception as e:
                        print_yellow(
                            f"comment id by id does not exist, {e}")
                    subreddit = element['subreddit']
                    parent_data = self.__find_parent(parent_id)
                    print_green(
                        f"parent_id => {parent_id}, body => {body}, "
                        f"created_utc => {created_utc}, "
                        f"comment_id => {comment_id}, "
                        f"subreddit => {subreddit}, "
                        f"parent_data => {parent_data}")
                    if score >= 2:
                        comment_score = self.__find_score(parent_id)
                        if comment_score:
                            # Keep only the highest-scoring reply
                            # to a given parent.
                            if score > comment_score:
                                if self.__acceptable(body):
                                    self.insert_or_replace_comment(
                                        comment_id, parent_id,
                                        parent_data, body, subreddit,
                                        created_utc, score)
                        else:
                            if self.__acceptable(body):
                                if parent_data:
                                    self.insert_parent(
                                        True, comment_id, parent_id,
                                        parent_data, body, subreddit,
                                        created_utc, score)
                                    paired_rows += 1
                                else:
                                    self.insert_parent(
                                        False, comment_id, parent_id,
                                        None, body, subreddit,
                                        created_utc, score)
                    self.display_rows(row_counter, data, paired_rows)
                    self.clean_rows(row_counter, data)
        else:
            print_red(f"file of name {f_name} is not a json file")
words = args.words
destination = "data/unpacked"
dataset = None
print(f"source => {source}")
print(f"sources => {sources}")
if source is not None:
    compressor = set_compressor(source, destination)
    compressor.set_values_to_db()
elif sources is not None and engine is None and lang is None:
    for s in sources:
        compressor = set_compressor(s, destination)
        if compressor is not None:
            print_yellow(f"running source {s} and setting values to db")
            compressor.set_values_to_db()
            dataset = compressor.dataset
elif sources is not None and engine is not None and lang is None:
    for s in sources:
        compressor = set_compressor(s, destination)
        if compressor is not None:
            print_yellow(f"running source {s} and backend {engine}")
            compressor.run_backend(engine)
elif sources is not None and engine is not None and lang is not None:
    for s in sources:
        compressor = set_compressor(s, destination)
        if compressor is not None:
            print_yellow(f"running source {s} and backend {engine}")
            compressor.run_backend(engine, lang)
elif source is None and sources is None and words is not None:
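This dispatch reads source, sources, engine, lang, and words off an argparse namespace defined outside the excerpt. A hypothetical parser matching those attribute names (an assumption, since the real definition is not shown) would look like this:

import argparse

# Hypothetical flags; the project's actual parser is not in this excerpt.
parser = argparse.ArgumentParser()
parser.add_argument("--source")
parser.add_argument("--sources", nargs="+")
parser.add_argument("--engine")
parser.add_argument("--lang")
parser.add_argument("--words", nargs="+")
args = parser.parse_args(["--sources", "reddit", "facebook"])

# With only --sources set, the branch that calls set_values_to_db()
# for each source is the one taken above.
print(args.sources, args.engine, args.lang)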
def convert_encoding(self, data):
    try:
        data = data.encode('iso-8859-1').decode('utf-8')
    except Exception as e:
        print_yellow(f"cannot get encoding from {data}, {str(e)}")
    return data
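The round-trip works because the export contains UTF-8 bytes that were decoded as latin-1, producing mojibake; re-encoding as latin-1 recovers the raw bytes, and decoding those as UTF-8 restores the original text. A worked example:

# 'Ã©' is what UTF-8 'é' (bytes C3 A9) looks like after a latin-1 decode.
garbled = "Ã©"
fixed = garbled.encode("iso-8859-1").decode("utf-8")
print(fixed)  # é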