예제 #1
0
 def set_values_to_db(self):
     """Read verb/synonym rows from every source file and store them.

     Each path returned by ``self.get_json_files()`` is parsed as
     comma-separated text without a header row.  Within a row, column 0
     is the parent verb and every other non-empty cell is a synonym: the
     synonym is looked up with ``__find_synonym`` and the result is passed
     to ``__set_synonyms`` together with the verb and its parent.

     The frame parsed from the most recently read file is kept on
     ``self.dataset``.  Empty files are reported and skipped.
     """
     source_files = self.get_json_files()
     print_green(source_files)
     for source_file in source_files:
         print_green(source_file)
         # Rows are ragged, so find the widest line first and hand pandas
         # that many column names; otherwise short first rows would fix
         # the column count too low.
         with open(source_file, 'r') as temp_f:
             col_count = [len(line.split(",")) for line in temp_f]
         if not col_count:
             # max() of an empty list raises ValueError and pandas cannot
             # parse an empty file either, so there is nothing to import.
             print_yellow(f"file {source_file} is empty, skipping")
             continue
         column_names = list(range(max(col_count)))
         # dtype=str keeps every cell a string so .strip() below cannot
         # fail on a value pandas would otherwise parse as a number.
         # on_bad_lines="warn" replaces error_bad_lines=False (removed in
         # pandas 2.0) and keeps the old "warn and skip" behaviour.
         dataset = pd.read_csv(source_file,
                               header=None,
                               delimiter=",",
                               names=column_names,
                               dtype=str,
                               on_bad_lines="warn")
         self.dataset = dataset
         for index, row in dataset.iterrows():
             print_yellow(f"index => {index}")
             parent_verb = None
             for position, cell in enumerate(row):
                 if pd.isnull(cell):
                     # Padding cell of a row shorter than the widest one.
                     continue
                 if position == 0:
                     parent_verb = cell.strip()
                     continue
                 verb = cell.strip()
                 print_magenta(f"parent_verb => {parent_verb}")
                 print_blue(f"verb => {verb}")
                 synonym_id = self.__find_synonym(verb)
                 print_gray(f"synonym_id => {synonym_id}")
                 self.__set_synonyms(synonym_id, verb, parent_verb)
예제 #2
0
 def set_values_to_db(self):
     """Read translation pairs from every source file and store them.

     Each path returned by ``self.get_json_files()`` is parsed as
     tab-separated text without a header row.  Column 0 holds the
     untranslated text and column 1 its translation; further columns are
     ignored.  The target language is resolved per file with
     ``__find_language`` and the source language is always ``"en"``.
     A pair already known to ``__find_translation`` is updated, otherwise
     it is inserted.

     The frame parsed from the most recently read file is kept on
     ``self.dataset``.  Empty files are reported and skipped.
     """
     # One delimiter for both the width scan and the parser: counting
     # commas while parsing on tabs produced a ``names`` list of the wrong
     # length, which shifts or drops columns.
     delimiter = "\t"
     json_files = self.get_json_files()
     for json_file in json_files:
         print_green(json_file)
         translate_language = self.__find_language(json_file)
         print_cyan(f"translate language => {translate_language}")
         # Rows may be ragged, so size the frame for the widest line.
         with open(json_file, 'r') as temp_f:
             col_count = [len(line.split(delimiter)) for line in temp_f]
         if not col_count:
             # max() of an empty list raises ValueError and pandas cannot
             # parse an empty file either, so there is nothing to import.
             print_yellow(f"file {json_file} is empty, skipping")
             continue
         column_names = list(range(max(col_count)))
         # on_bad_lines="warn" replaces error_bad_lines=False (removed in
         # pandas 2.0) and keeps the old "warn and skip" behaviour.
         dataset = pd.read_csv(json_file,
                               header=None,
                               delimiter=delimiter,
                               names=column_names,
                               on_bad_lines="warn")
         self.dataset = dataset
         for index, data in dataset.iterrows():
             print_yellow(f"index => {index}")
             cells = list(data)
             non_translate = None if pd.isnull(cells[0]) else cells[0]
             if len(cells) < 2 or pd.isnull(cells[1]):
                 # No translation on this row, nothing to store.
                 continue
             translate = cells[1]
             translate_id = self.__find_translation(translate,
                                                    non_translate)
             print_gray(f"translate_id => {translate_id}")
             if translate_id:
                 self.__update_translation(translate, non_translate, "en",
                                           translate_language,
                                           translate_id)
             else:
                 self.__set_translation(translate, non_translate, "en",
                                        translate_language)
예제 #3
0
 def _load_json(self, file_name, encoding='utf-8', errors='ignore'):
     """Parse *file_name* with ``JsonComment`` and return the loaded data.

     The file is opened in text mode with the given *encoding* and
     *errors* policy.  Any exception raised while opening or parsing is
     reported through ``print_yellow`` and ``None`` is returned instead.
     """
     try:
         with open(file_name, encoding=encoding, errors=errors) as handle:
             return JsonComment(json).load(handle)
     except Exception as e:
         print_yellow(f"cannot load json from {file_name}, {e}")
     return None
예제 #4
0
 def transaction_bldr(self, query):
     """Queue *query* and flush the queue once it exceeds 1000 entries.

     The statement is appended to ``self.sql_transaction``.  When the
     queue then holds more than 1000 statements, they are executed in
     order inside one explicit transaction and committed, and the queue
     is reset to an empty list.  A statement that fails is reported with
     ``print_yellow`` and skipped; the remaining ones still run.
     """
     pending = self.sql_transaction
     pending.append(query)
     if len(pending) <= 1000:
         return

     # Obtain the cursor/connection pair on first use.
     if self.cursor is None:
         self.cursor, self.connection = self.get_cursor()

     self.cursor.execute("BEGIN TRANSACTION")
     for s in pending:
         try:
             self.cursor.execute(s)
         except Exception as e:
             print_yellow(f"cannot execute query {s}, {str(e)}")
     self.connection.commit()
     self.sql_transaction = []
예제 #5
0
    def set_values_to_db(self):
        """Store every message of the exported conversation files.

        Each file from ``self.get_json_files()`` is loaded as JSON with
        ``participants`` and ``messages`` keys.  Messages are walked in
        reverse order and each one gets a guid built from its position,
        the first participant's name (replaced by ``facebook_user`` for one
        specific name) and its timestamp.  A message whose guid is already
        known to ``__find_message`` is updated, otherwise it is inserted.
        Missing ``content`` or ``photos`` fall back to an empty string.
        """
        for json_file in self.get_json_files():
            print_green(json_file)
            with open(json_file, buffering=1000,
                      encoding='iso-8859-1') as handle:
                payload = json.load(handle)
            print_blue(payload)
            participants = payload["participants"]
            for index, message in enumerate(reversed(payload["messages"])):
                sender_name = self.convert_encoding(message["sender_name"])
                timestamp_ms = message["timestamp_ms"]
                message_type = self.convert_encoding(message["type"])
                sender_message_name = self.convert_encoding(
                    participants[0]['name'])
                guid = (f"{index}_facebook_user_{timestamp_ms}"
                        if sender_message_name == "Konrad Uciechowski" else
                        f"{index}_{sender_message_name}_{timestamp_ms}")

                print_cyan(guid)

                # Text body is optional.
                try:
                    content = self.convert_encoding(message["content"])
                except Exception as e:
                    print_yellow(f"cannot get content, {e}")
                    content = ""

                # Attached photos are optional; keep their URIs as one
                # comma-separated string.
                try:
                    photo_str = ', '.join(
                        [str(photo["uri"]) for photo in message["photos"]])
                except Exception as e:
                    print_yellow(f"cannot get photos, {e}")
                    photo_str = ""

                if self.__find_message(guid):
                    store = self.__update_message
                else:
                    store = self.__set_message
                store(guid, sender_name, timestamp_ms, message_type,
                      content, photo_str)
예제 #6
0
 def set_values_to_db(self):
     """Store every chat event of the exported conversation files.

     Each file from ``self.get_json_files()`` is loaded as JSON with a
     top-level ``conversations`` list.  For every conversation the
     participant map is built with ``self.get_participants`` and each
     entry of ``events`` is written to the database: updated when
     ``__find_message`` already knows the event id, inserted otherwise.

     The message text and type come from the first segment of the event's
     chat message.  Events without such a segment are stored with an empty
     text and an empty type.

     Raises:
         KeyError: if a mandatory key is missing or the sender id is not
             in the participant map.
     """
     json_files = self.get_json_files()
     for json_file in json_files:
         print_green(json_file)
         with open(json_file, buffering=1000, encoding='iso-8859-1') as f:
             j = json.load(f)
         print_blue(j)
         for conversation in j["conversations"]:
             participant_data = conversation["conversation"][
                 "conversation"]["participant_data"]
             participants = self.get_participants(participant_data)
             print_magenta(participants)
             for event in conversation["events"]:
                 conversation_id = event["conversation_id"]["id"]
                 sender_id = event["sender_id"]["gaia_id"]
                 fallback_name = participants[sender_id]
                 timestamp = event["timestamp"]
                 event_id = event["event_id"]
                 # Reset per event: without these defaults a failed
                 # lookup left the names unbound (first event) or holding
                 # the previous event's values, which were then written
                 # to the database under this event's id.
                 event_type = ""
                 text = ""
                 try:
                     segment = event["chat_message"]["message_content"][
                         "segment"][0]
                     event_type = segment["type"]
                     text = self.convert_encoding(segment["text"])
                 except (KeyError, IndexError, TypeError) as e:
                     print_yellow(f"cannot find segment in {event}, {e}")
                 print_green(
                     f"conversation_id => {conversation_id}, sender_id => {sender_id}, fallback_name => {fallback_name}, timesttamp => {timestamp}, event_id => {event_id}, event_type => {event_type}, text => {text}"
                 )
                 message = self.__find_message(event_id)
                 if message:
                     self.__update_message(event_id, text, sender_id,
                                           fallback_name, timestamp,
                                           conversation_id, event_type)
                 else:
                     self.__set_message(event_id, text, sender_id,
                                        fallback_name, timestamp,
                                        conversation_id, event_type)
예제 #7
0
    def set_values_to_db(self):
        """Import comment rows from the JSON-lines files into the database.

        Every ``.json`` file listed by ``self._get_dir_files`` for
        ``self.destination_folder`` is read line by line; each line is one
        JSON object describing a comment.  Only comments with a score of
        at least 2 and a body accepted by ``__acceptable`` are stored:

        * if ``__find_score`` returns a score for the comment's parent id,
          the stored reply is replaced only when the new score is higher;
        * otherwise the comment is inserted, paired with its parent's
          data when ``__find_parent`` finds some.

        The comment id is taken from ``id`` and, failing that, from
        ``name``.  Rows that carry neither are reported and not stored.
        ``display_rows`` and ``clean_rows`` are called after every row.
        Files without a ``.json`` suffix are reported and skipped.
        """
        row_counter = 0
        paired_rows = 0
        f_list = self._get_dir_files(self.destination_folder)
        dir_path = Path(__file__).parent.parent.parent.parent
        for f_name in f_list:
            if f_name.suffix != '.json':
                print_red(f"file of name {f_name} is not a json file")
                continue
            file = f"{dir_path}/{f_name}"
            print_blue(file)

            with open(file, buffering=1000) as f:
                for data in f:
                    element = json.loads(data)
                    row_counter += 1
                    print_blue(element)
                    parent_id = element['parent_id']
                    body = self.__format_data(element['body'])
                    created_utc = element['created_utc']
                    score = element['score']

                    # Reset per row: previously a row with neither key
                    # reused the id of the preceding row (or raised on the
                    # very first row).  'id' still wins over 'name' when
                    # both are present.
                    comment_id = None
                    try:
                        comment_id = element['name']
                    except KeyError as e:
                        print_yellow(
                            f"comment id by name do not exists, {e}")
                    try:
                        comment_id = element['id']
                    except KeyError as e:
                        print_yellow(
                            f"comment id by id do not exists, {e}")

                    subreddit = element['subreddit']
                    parent_data = self.__find_parent(parent_id)
                    print_green(
                        f"parent_id => {parent_id}, body => {body}, created_utc => {created_utc}, comment_id => {comment_id}, subreddit => {subreddit}, parent_data => {parent_data}"
                    )
                    if comment_id is None:
                        print_yellow(
                            "row has neither name nor id, not stored")
                    elif score >= 2:
                        comment_score = self.__find_score(parent_id)
                        if comment_score:
                            # A reply to this parent is already stored;
                            # keep only the better-scored one.
                            if (score > comment_score
                                    and self.__acceptable(body)):
                                self.insert_or_replace_comment(
                                    comment_id, parent_id, parent_data,
                                    body, subreddit, created_utc, score)
                        elif self.__acceptable(body):
                            if parent_data:
                                self.insert_parent(
                                    True, comment_id, parent_id,
                                    parent_data, body, subreddit,
                                    created_utc, score)
                                paired_rows += 1
                            else:
                                self.insert_parent(
                                    False, comment_id, parent_id, None,
                                    body, subreddit, created_utc, score)
                    self.display_rows(row_counter, data, paired_rows)
                    self.clean_rows(row_counter, data)
예제 #8
0
파일: main.py 프로젝트: Ludaxord/HelloChat
# Command-line dispatch: decide which importer/backend to run based on the
# parsed arguments.  `args`, `source`, `sources`, `engine` and `lang` are
# defined earlier in the script (not visible in this excerpt).
words = args.words
# Folder handed to every compressor as its destination.
destination = "data/unpacked"

# Filled only by the multi-source import branch below.
dataset = None

print(f"source => {source}")
print(f"sources => {sources}")

if source is not None:
    # Single source: import its values into the database.
    # NOTE(review): unlike the loops below, this branch does not check
    # whether set_compressor returned None -- confirm it cannot here.
    compressor = set_compressor(source, destination)
    compressor.set_values_to_db()
elif sources is not None and engine is None and lang is None:
    # Several sources, no backend requested: import each one in turn.
    for s in sources:
        compressor = set_compressor(s, destination)
        if compressor is not None:
            print_yellow(f"running source {s} and setting values to db")
            compressor.set_values_to_db()
            # Overwritten on every iteration, so only the dataset of the
            # last successfully created compressor is kept.
            dataset = compressor.dataset
elif sources is not None and engine is not None and lang is None:
    # Several sources with a backend but no language.
    for s in sources:
        compressor = set_compressor(s, destination)
        if compressor is not None:
            print_yellow(f"running source {s} and backend {engine}")
            compressor.run_backend(engine)
elif sources is not None and engine is not None and lang is not None:
    # Several sources with both a backend and a language.
    for s in sources:
        compressor = set_compressor(s, destination)
        if compressor is not None:
            print_yellow(f"running source {s} and backend {engine}")
            compressor.run_backend(engine, lang)
elif source is None and sources is None and words is not None:
예제 #9
0
 def convert_encoding(self, data):
     """Re-decode *data* from ISO-8859-1 bytes as UTF-8.

     The text is encoded as ISO-8859-1 and the resulting bytes are decoded
     as UTF-8.  If either step raises, the problem is reported through
     ``print_yellow`` and *data* is returned unchanged.
     """
     try:
         return data.encode('iso-8859-1').decode('utf-8')
     except Exception as e:
         print_yellow(f"cannot get encoding from {data}, {str(e)}")
         return data