Exemplo n.º 1
0
 def set_values_to_db(self):
     """Insert or update one database record per row of every data file.

     Every path returned by ``self.get_json_files()`` is read as a
     tab-separated, ISO-8859-1 encoded table. For each row the six columns
     named in ``columns`` are extracted; the row is looked up by article
     title, question and answer and is then updated when the lookup result
     is truthy, or inserted otherwise.
     """
     columns = ('ArticleTitle', 'Question', 'Answer',
                'DifficultyFromQuestioner', 'DifficultyFromAnswerer',
                'ArticleFile')
     for path in self.get_json_files():
         print_green(path)
         table = pd.read_csv(path, delimiter='\t', encoding='iso-8859-1')
         for _, row in table.iterrows():
             print_blue(row)
             # Columns are read in declaration order, one lookup each.
             (article_title, question, answer, difficulty_from_questioner,
              difficulty_from_answerer,
              article_file) = [row[column] for column in columns]
             print_green(
                 f"article_title => {article_title}, question => {question}, answer => {answer}, difficulty_from_questioner => {difficulty_from_questioner}, difficulty_from_answerer => {difficulty_from_answerer}, article_file => {article_file}"
             )
             if self.__find_message(article_title, question, answer):
                 self.__update_message(article_title, question, answer,
                                       difficulty_from_questioner,
                                       difficulty_from_answerer,
                                       article_file)
             else:
                 self.__set_message(article_title, question, answer,
                                    difficulty_from_questioner,
                                    difficulty_from_answerer, article_file)
Exemplo n.º 2
0
 def set_values_to_db(self):
     """Store the translation pairs found in every tab-separated data file.

     For each path returned by ``self.get_json_files()`` the target
     language is resolved with ``__find_language`` and the file is parsed
     without a header row. Column 0 of a row is taken as the untranslated
     text and column 1 as its translation; any further columns are
     ignored. When ``__find_translation`` returns a truthy id the existing
     translation is updated, otherwise a new one is inserted. ``"en"`` is
     always passed as the source language. Empty files are skipped. The
     most recently parsed frame is kept on ``self.dataset``.
     """
     json_files = self.get_json_files()
     for json_file in json_files:
         print_green(json_file)
         translate_language = self.__find_language(json_file)
         print_cyan(f"translate language => {translate_language}")
         # Count fields with the same delimiter read_csv uses below;
         # counting commas in a tab-separated file can yield a name list
         # that does not match the columns pandas actually parses.
         with open(json_file, 'r') as temp_f:
             max_columns = max((len(li.split("\t")) for li in temp_f),
                               default=0)
         if max_columns == 0:
             print_yellow(f"{json_file} is empty, skipping")
             continue
         column_names = list(range(max_columns))
         try:
             dataset = pd.read_csv(json_file,
                                   header=None,
                                   delimiter="\t",
                                   names=column_names,
                                   on_bad_lines="skip")
         except TypeError:
             # pandas < 1.3 only knows the older spelling of this option.
             dataset = pd.read_csv(json_file,
                                   header=None,
                                   delimiter="\t",
                                   names=column_names,
                                   error_bad_lines=False)
         self.dataset = dataset
         for index, data in dataset.iterrows():
             print_yellow(f"index => {index}")
             non_translate = None
             for i, d in enumerate(data):
                 if i > 1:
                     # Only the first two columns carry the pair.
                     break
                 if pd.isnull(d):
                     continue
                 if i == 0:
                     non_translate = d
                     continue
                 translate = d
                 translate_id = self.__find_translation(
                     translate, non_translate)
                 print_gray(f"translate_id => {translate_id}")
                 if translate_id:
                     self.__update_translation(
                         translate, non_translate, "en",
                         translate_language, translate_id)
                 else:
                     self.__set_translation(translate,
                                            non_translate, "en",
                                            translate_language)
Exemplo n.º 3
0
 def set_values_to_db(self):
     """Store the synonym groups found in every comma-separated data file.

     Each path returned by ``self.get_json_files()`` is parsed without a
     header row. Within a row, column 0 (stripped) is the parent verb and
     every other non-null column (stripped) is a verb that is looked up
     with ``__find_synonym`` and stored with ``__set_synonyms`` together
     with the parent verb. Empty files are skipped. The most recently
     parsed frame is kept on ``self.dataset``.
     """
     json_files = self.get_json_files()
     print_green(json_files)
     for json_file in json_files:
         print_green(json_file)
         # Rows may have different lengths, so size the column list from
         # the widest line before handing the file to pandas.
         with open(json_file, 'r') as temp_f:
             max_columns = max((len(li.split(",")) for li in temp_f),
                               default=0)
         if max_columns == 0:
             print_yellow(f"{json_file} is empty, skipping")
             continue
         column_names = list(range(max_columns))
         try:
             dataset = pd.read_csv(json_file,
                                   header=None,
                                   delimiter=",",
                                   names=column_names,
                                   on_bad_lines="skip")
         except TypeError:
             # pandas < 1.3 only knows the older spelling of this option.
             dataset = pd.read_csv(json_file,
                                   header=None,
                                   delimiter=",",
                                   names=column_names,
                                   error_bad_lines=False)
         self.dataset = dataset
         for index, data in dataset.iterrows():
             print_yellow(f"index => {index}")
             parent_verb = None
             for i, d in enumerate(data):
                 if pd.isnull(d):
                     continue
                 if i == 0:
                     parent_verb = d.strip()
                     continue
                 verb = d.strip()
                 print_magenta(f"parent_verb => {parent_verb}")
                 print_blue(f"verb => {verb}")
                 synonym_id = self.__find_synonym(verb)
                 print_gray(f"synonym_id => {synonym_id}")
                 self.__set_synonyms(synonym_id, verb, parent_verb)
Exemplo n.º 4
0
 def __decompress_bz2_file(self, file, with_extension):
     """Decompress a bz2 archive into ``self.destination_folder``.

     The output is written to
     ``<destination_folder>/<file.name><with_extension>``. Data is streamed
     in chunks and both file handles are closed when done.

     Args:
         file: Archive to read; must expose a ``name`` attribute
             (presumably a ``pathlib.Path`` -- confirm against callers).
         with_extension: Suffix appended to ``file.name`` to form the
             output file name.
     """
     # Local import keeps this method self-contained.
     import shutil

     file_name = f"{self.destination_folder}/{file.name}{with_extension}"
     with BZ2File(file) as compressed, open(file_name, 'wb') as destination:
         shutil.copyfileobj(compressed, destination)
     print_green(f"unpacked bz2 file completed to {file_name}")
Exemplo n.º 5
0
 def __decompress_xz_file(self, file, with_extension):
     """Decompress an xz archive into ``self.destination_folder``.

     The output is written to
     ``<destination_folder>/<file.name><with_extension>`` by streaming the
     decompressed data with ``shutil.copyfileobj``.

     Args:
         file: Archive to read; must expose a ``name`` attribute
             (presumably a ``pathlib.Path`` -- confirm against callers).
         with_extension: Suffix appended to ``file.name`` to form the
             output file name.
     """
     file_name = f"{self.destination_folder}/{file.name}{with_extension}"
     with lzma.open(file) as compressed, open(file_name, 'wb') as destination:
         shutil.copyfileobj(compressed, destination)
     print_green(f"unpacked xz file completed to {file_name}")
Exemplo n.º 6
0
 def __decompress_zst_file(self, file, with_extension):
     """Decompress a zstd archive into ``self.destination_folder``.

     The output is written to
     ``<destination_folder>/<file.name><with_extension>`` by streaming the
     archive through ``ZstdDecompressor.copy_stream``.

     Args:
         file: Archive to read; it is opened in binary mode and must
             expose a ``name`` attribute (presumably a ``pathlib.Path`` --
             confirm against callers).
         with_extension: Suffix appended to ``file.name`` to form the
             output file name.
     """
     file_name = f"{self.destination_folder}/{file.name}{with_extension}"
     decomp = ZstdDecompressor()
     with open(file, 'rb') as compressed, open(file_name, 'wb') as destination:
         decomp.copy_stream(compressed, destination)
     print_green(f"unpacked zst file completed to {file_name}")
Exemplo n.º 7
0
    def set_values_to_db(self):
        """Insert or update one database record per loaded iMessage row.

        The frame returned by ``self.load_imessages_from_file()`` is
        converted to a list of rows. The first 29 values of each row are
        passed positionally, in the order given by ``field_names``, to
        ``__set_message`` when no message with the row's ``guid`` is found
        and to ``__update_message`` otherwise.
        """
        # Positional layout of a row; index 1 (guid) is the lookup key.
        field_names = (
            "ROWID", "guid", "text", "handle_id", "service", "account",
            "date", "date_read", "date_delivered", "is_delivered",
            "is_finished", "is_emote", "is_from_me", "is_empty",
            "is_delayed", "is_auto_reply", "is_prepared", "is_read",
            "is_system_message", "is_sent", "has_dd_results",
            "cache_has_attachments", "item_type", "group_title",
            "is_expirable", "message_source", "ck_record_id",
            "destination_caller", "is_spam",
        )
        rows = self.load_imessages_from_file().values.tolist()
        for row in rows:
            print_green(row)
            fields = [row[position] for position in range(len(field_names))]
            guid = fields[1]

            if self.__find_message(guid):
                self.__update_message(*fields)
            else:
                self.__set_message(*fields)
Exemplo n.º 8
0
 def download_dataset(self, url=dataset_source_url):
     """Download every dataset listed for ``url`` into ``data/packed``.

     The names returned by ``__get_dataset(url)`` are fetched from
     ``f"{url}{name}"`` unless ``data/packed/<name>`` already exists.
     Each download is written to a ``.part`` file first and only moved to
     its final name once it has completed, so an interrupted transfer is
     not mistaken for an existing dataset on the next run. Failures are
     reported and do not stop the remaining downloads.

     Args:
         url: Base URL the dataset names are appended to.
     """
     datasets = self.__get_dataset(url)
     for dataset in datasets:
         target = Path(f"data/packed/{dataset}")
         partial = Path(f"data/packed/{dataset}.part")
         try:
             if not target.exists():
                 print_blue(f"downloading file {dataset}")
                 urllib2.urlretrieve(f"{url}{dataset}",
                                     str(partial),
                                     reporthook=report_hook)
                 # Publish the file only after the transfer succeeded.
                 partial.replace(target)
                 print_green(f"{dataset} file downloaded successfully")
             else:
                 print_red(f"{dataset} already exists!")
         except Exception as e:
             print_red(
                 f"cannot download data from url {url}{dataset}, {str(e)}")
             try:
                 partial.unlink()
             except OSError:
                 # No temporary file to remove, or it cannot be removed;
                 # a later attempt overwrites it anyway.
                 pass
Exemplo n.º 9
0
    def set_values_to_db(self):
        """Insert or update one database record per message of every file.

        Each path returned by ``self.get_json_files()`` is loaded as JSON
        (opened with ISO-8859-1 encoding). Its ``messages`` list is walked
        in reverse order. A guid is built from the message position, the
        first participant's name (replaced by ``facebook_user`` for one
        hard-coded name) and the message timestamp. Content and photo URIs
        fall back to an empty string when they cannot be read. The message
        is updated when the guid is already known, otherwise inserted.
        """
        for json_file in self.get_json_files():
            print_green(json_file)
            with open(json_file, buffering=1000, encoding='iso-8859-1') as f:
                payload = json.load(f)
                print_blue(payload)
                participants = payload["participants"]
                for index, message in enumerate(reversed(payload["messages"])):
                    sender_name = self.convert_encoding(message["sender_name"])
                    timestamp_ms = message["timestamp_ms"]
                    message_type = self.convert_encoding(message["type"])
                    sender_message_name = self.convert_encoding(
                        participants[0]['name'])
                    guid = (f"{index}_facebook_user_{timestamp_ms}"
                            if sender_message_name == "Konrad Uciechowski"
                            else f"{index}_{sender_message_name}_{timestamp_ms}")

                    print_cyan(guid)

                    # Any failure while reading the text leaves it empty.
                    try:
                        content = self.convert_encoding(message["content"])
                    except Exception as e:
                        print_yellow(f"cannot get content, {e}")
                        content = ""

                    # Any failure while reading photos leaves the list empty.
                    try:
                        photo_str = ', '.join(
                            str(photo["uri"]) for photo in message["photos"])
                    except Exception as e:
                        print_yellow(f"cannot get photos, {e}")
                        photo_str = ""

                    store = (self.__update_message
                             if self.__find_message(guid)
                             else self.__set_message)
                    store(guid, sender_name, timestamp_ms,
                          message_type, content, photo_str)
Exemplo n.º 10
0
 def set_values_to_db(self):
     """Insert or update one database record per event of every file.

     Each path returned by ``self.get_json_files()`` is loaded as JSON
     (opened with ISO-8859-1 encoding). For every conversation the
     participants are resolved with ``self.get_participants`` and every
     event is stored under its ``event_id``: updated when
     ``__find_message`` returns a truthy value, inserted otherwise. The
     event type and text come from the first message segment; when that
     segment cannot be read they are stored as empty strings.
     """
     json_files = self.get_json_files()
     for json_file in json_files:
         print_green(json_file)
         with open(json_file, buffering=1000, encoding='iso-8859-1') as f:
             j = json.load(f)
             print_blue(j)
             conversations = j["conversations"]
             for conversation in conversations:
                 participant_data = conversation["conversation"][
                     "conversation"]["participant_data"]
                 participants = self.get_participants(participant_data)
                 print_magenta(participants)
                 events = conversation["events"]
                 for event in events:
                     conversation_id = event["conversation_id"]["id"]
                     sender_id = event["sender_id"]["gaia_id"]
                     fallback_name = participants[sender_id]
                     timestamp = event["timestamp"]
                     event_id = event["event_id"]
                     # Reset per event so a failed lookup can neither
                     # leave these names unbound nor reuse the values of
                     # the previous event.
                     event_type = ""
                     text = ""
                     try:
                         segment = event["chat_message"]["message_content"][
                             "segment"][0]
                         event_type = segment["type"]
                         text = self.convert_encoding(segment["text"])
                     except Exception as e:
                         print_yellow(
                             f"cannot find segment in {event}, {e}")
                     print_green(
                         f"conversation_id => {conversation_id}, sender_id => {sender_id}, fallback_name => {fallback_name}, timesttamp => {timestamp}, event_id => {event_id}, event_type => {event_type}, text => {text}"
                     )
                     message = self.__find_message(event_id)
                     if message:
                         self.__update_message(event_id, text, sender_id,
                                               fallback_name, timestamp,
                                               conversation_id, event_type)
                     else:
                         self.__set_message(event_id, text, sender_id,
                                            fallback_name, timestamp,
                                            conversation_id, event_type)
Exemplo n.º 11
0
 def set_values_to_db(self):
     """Insert or update one database record per row of every data file.

     Every path returned by ``self.get_json_files()`` is read as a
     tab-separated table. For each row the seven columns named in
     ``columns`` are extracted; the row is updated when
     ``__find_message(document_id)`` returns a truthy value and inserted
     otherwise.
     """
     columns = ("QuestionID", "Question", "DocumentID", "DocumentTitle",
                "SentenceID", "Sentence", "Label")
     for path in self.get_json_files():
         print_green(path)
         table = pd.read_csv(path, delimiter='\t')
         for _, row in table.iterrows():
             print_blue(row)
             # Columns are read in declaration order, one lookup each.
             (question_id, question, document_id, document_title,
              sentence_id, sentence, label) = [row[name] for name in columns]
             print_green(
                 f"question_id => {question_id}, question => {question}, document_id => {document_id}, document_title => {document_title}, sentence_id => {sentence_id}, sentence => {sentence}, label => {label}")
             record = (question_id, question, document_id, document_title,
                       sentence_id, sentence, label)
             # NOTE(review): the lookup key is the document id only --
             # confirm this is the intended uniqueness key.
             if self.__find_message(document_id):
                 self.__update_message(*record)
             else:
                 self.__set_message(*record)
Exemplo n.º 12
0
    def set_values_to_db(self):
        """Load comments from every ``.json`` file into the database.

        Files listed by ``self._get_dir_files(self.destination_folder)``
        are read line by line, each line being one JSON object. A comment
        with a score of at least 2 is stored when its body passes
        ``__acceptable``: it replaces the stored entry for the parent when
        ``__find_score(parent_id)`` returns a truthy score lower than the
        comment's, and is inserted via ``insert_parent`` when no such
        score exists. Rows that carry neither a ``name`` nor an ``id`` are
        not written. ``display_rows`` and ``clean_rows`` are called after
        every row. Non-JSON files are reported and skipped.
        """
        row_counter = 0
        paired_rows = 0
        f_list = self._get_dir_files(self.destination_folder)
        dir_path = Path(__file__).parent.parent.parent.parent
        for f_name in f_list:
            if f_name.suffix != '.json':
                print_red(f"file of name {f_name} is not a json file")
                continue

            file = f"{dir_path}/{f_name}"
            print_blue(file)

            with open(file, buffering=1000) as f:
                for data in f:
                    element = json.loads(data)
                    row_counter += 1
                    print_blue(element)
                    parent_id = element['parent_id']
                    body = self.__format_data(element['body'])
                    created_utc = element['created_utc']
                    score = element['score']
                    # Reset per row so a row without any id can neither
                    # leave the name unbound nor reuse the previous row's
                    # id. When both keys exist, 'id' takes precedence.
                    comment_id = None
                    try:
                        comment_id = element['name']
                    except KeyError as e:
                        print_yellow(
                            f"comment id by name do not exists, {e}")
                    try:
                        comment_id = element['id']
                    except KeyError as e:
                        print_yellow(
                            f"comment id by id do not exists, {e}")
                    subreddit = element['subreddit']
                    parent_data = self.__find_parent(parent_id)
                    print_green(
                        f"parent_id => {parent_id}, body => {body}, created_utc => {created_utc}, comment_id => {comment_id}, subreddit => {subreddit}, parent_data => {parent_data}"
                    )
                    if comment_id is None:
                        print_yellow(
                            "comment has neither name nor id, "
                            "skipping database write")
                    elif score >= 2:
                        comment_score = self.__find_score(parent_id)
                        if comment_score:
                            # Replace only with a better-scored reply.
                            if (score > comment_score
                                    and self.__acceptable(body)):
                                self.insert_or_replace_comment(
                                    comment_id, parent_id, parent_data,
                                    body, subreddit, created_utc,
                                    score)
                        elif self.__acceptable(body):
                            if parent_data:
                                self.insert_parent(
                                    True, comment_id, parent_id,
                                    parent_data, body, subreddit,
                                    created_utc, score)
                                paired_rows += 1
                            else:
                                self.insert_parent(
                                    False, comment_id, parent_id, None,
                                    body, subreddit, created_utc,
                                    score)
                    self.display_rows(row_counter, data, paired_rows)
                    self.clean_rows(row_counter, data)