Пример #1
0
 def set_values_to_db(self):
     """Load every tab-separated question/answer file and store its rows.

     Each path returned by ``self.get_json_files()`` is parsed with pandas
     (tab delimiter, ISO-8859-1 encoding). Every row is logged, then looked
     up by article title, question and answer: a truthy lookup result
     triggers an update of the stored record, anything else creates a new
     one.
     """
     for source_path in self.get_json_files():
         print_green(source_path)
         frame = pd.read_csv(source_path,
                             delimiter='\t',
                             encoding='iso-8859-1')
         for _, row in frame.iterrows():
             print_blue(row)
             # Columns are read one by one, in file-schema order, so a
             # missing column fails on the same key as before.
             article_title = row['ArticleTitle']
             question = row['Question']
             answer = row['Answer']
             difficulty_from_questioner = row['DifficultyFromQuestioner']
             difficulty_from_answerer = row['DifficultyFromAnswerer']
             article_file = row['ArticleFile']
             print_green(
                 f"article_title => {article_title}, question => {question}, answer => {answer}, difficulty_from_questioner => {difficulty_from_questioner}, difficulty_from_answerer => {difficulty_from_answerer}, article_file => {article_file}"
             )
             record = (article_title, question, answer,
                       difficulty_from_questioner,
                       difficulty_from_answerer, article_file)
             already_stored = self.__find_message(article_title, question,
                                                  answer)
             if already_stored:
                 self.__update_message(*record)
             else:
                 self.__set_message(*record)
Пример #2
0
 def set_values_to_db(self):
     """Read comma-separated synonym files and store verb/parent pairs.

     For every path returned by ``self.get_json_files()`` the file is
     parsed as a header-less CSV whose rows may have different lengths.
     In each row the first non-null cell in column 0 is taken as the
     parent verb; every other non-null cell is treated as a verb that is
     looked up with ``__find_synonym`` and stored with ``__set_synonyms``.
     The most recently parsed frame is kept on ``self.dataset``.

     Empty files are skipped with a warning instead of raising.
     """
     json_files = self.get_json_files()
     print_green(json_files)
     for json_file in json_files:
         print_green(json_file)
         # Rows are ragged, so pre-scan the file for the widest row and
         # hand pandas enough column names to hold it.
         with open(json_file, 'r') as temp_f:
             widest_row = max((len(line.split(",")) for line in temp_f),
                              default=0)
         if widest_row == 0:
             print_yellow(f"file {json_file} is empty, skipping")
             continue
         column_names = list(range(widest_row))
         # dtype=str keeps numeric-looking cells as text so .strip() below
         # always operates on a string.
         read_options = dict(header=None,
                             delimiter=",",
                             names=column_names,
                             dtype=str)
         try:
             # pandas >= 1.3 spelling; error_bad_lines was removed in 2.0.
             dataset = pd.read_csv(json_file,
                                   on_bad_lines="skip",
                                   **read_options)
         except TypeError:
             # Older pandas does not know on_bad_lines.
             dataset = pd.read_csv(json_file,
                                   error_bad_lines=False,
                                   **read_options)
         self.dataset = dataset
         for index, data in dataset.iterrows():
             print_yellow(f"index => {index}")
             parent_verb = None
             for i, d in enumerate(data):
                 if pd.isnull(d):
                     continue
                 if i == 0:
                     parent_verb = d.strip()
                     continue
                 verb = d.strip()
                 print_magenta(f"parent_verb => {parent_verb}")
                 print_blue(f"verb => {verb}")
                 synonym_id = self.__find_synonym(verb)
                 print_gray(f"synonym_id => {synonym_id}")
                 self.__set_synonyms(synonym_id, verb, parent_verb)
Пример #3
0
    def decompress_file(self, file_path, with_extension):
        """Decompress ``file_path`` according to its suffix.

        ``.bz2``, ``.zst`` and ``.xz`` files are handed to the matching
        private helper; for any other suffix no helper is called.

        Returns:
            A string built from ``self.destination_folder``, the input path
            without its last extension, and ``with_extension``. It is
            returned even when no helper ran.
        """
        source = Path(file_path)
        base_name, _ = os.path.splitext(source)
        print_blue(f"decompressing file: {base_name}{with_extension}")

        suffix = source.suffix
        if suffix == ".bz2":
            self.__decompress_bz2_file(source, with_extension)
        elif suffix == ".zst":
            self.__decompress_zst_file(source, with_extension)
        elif suffix == ".xz":
            self.__decompress_xz_file(source, with_extension)

        return f"{self.destination_folder}/{base_name}{with_extension}"
Пример #4
0
 def download_dataset(self, url=dataset_source_url):
     """Download every dataset listed for ``url`` into ``data/packed/``.

     The dataset names come from ``self.__get_dataset(url)``. A dataset
     whose target path already exists is reported and skipped. Any error
     while checking or downloading one dataset is printed and the loop
     moves on to the next one.
     """
     for dataset in self.__get_dataset(url):
         target = f"data/packed/{dataset}"
         directory = Path(target)
         try:
             if directory.exists():
                 print_red(f"{dataset} already exists!")
                 continue
             print_blue(f"downloading file {dataset}")
             urllib2.urlretrieve(f"{url}{dataset}",
                                 target,
                                 reporthook=report_hook)
             print_green(f"{dataset} file downloaded successfully")
         except Exception as e:
             # Best effort: one failed download must not stop the rest.
             print_red(
                 f"cannot download data from url {url}{dataset}, {str(e)}")
Пример #5
0
    def set_values_to_db(self):
        """Load exported chat JSON files and store each message.

        For every path from ``self.get_json_files()`` the JSON document is
        read (ISO-8859-1) and its ``messages`` list is walked in reverse
        order. Each message gets a guid built from its position, the first
        participant's name (or ``facebook_user`` for one hard-coded name)
        and its timestamp. Missing or unreadable content and photos fall
        back to empty strings. A truthy ``__find_message(guid)`` result
        triggers an update of the stored message, anything else creates it.
        """
        for json_file in self.get_json_files():
            print_green(json_file)
            with open(json_file, buffering=1000,
                      encoding='iso-8859-1') as handle:
                payload = json.load(handle)
                print_blue(payload)
                participants = payload["participants"]
                for index, message in enumerate(reversed(payload["messages"])):
                    sender_name = self.convert_encoding(message["sender_name"])
                    timestamp_ms = message["timestamp_ms"]
                    message_type = self.convert_encoding(message["type"])
                    sender_message_name = self.convert_encoding(
                        participants[0]['name'])

                    # One specific name is replaced by a generic label.
                    if sender_message_name == "Konrad Uciechowski":
                        guid_owner = "facebook_user"
                    else:
                        guid_owner = sender_message_name
                    guid = f"{index}_{guid_owner}_{timestamp_ms}"
                    print_cyan(guid)

                    try:
                        content = self.convert_encoding(message["content"])
                    except Exception as e:
                        print_yellow(f"cannot get content, {e}")
                        content = ""

                    try:
                        photo_str = ', '.join(
                            str(photo["uri"]) for photo in message["photos"])
                    except Exception as e:
                        print_yellow(f"cannot get photos, {e}")
                        photo_str = ""

                    fields = (guid, sender_name, timestamp_ms, message_type,
                              content, photo_str)
                    if self.__find_message(guid):
                        self.__update_message(*fields)
                    else:
                        self.__set_message(*fields)
Пример #6
0
 def set_values_to_db(self):
     """Load exported conversation JSON files and store each event.

     For every path from ``self.get_json_files()`` the JSON document is
     read (ISO-8859-1). Each conversation's participant data is turned
     into a lookup via ``self.get_participants`` and every event is then
     stored: a truthy ``__find_message(event_id)`` result triggers an
     update, anything else creates a new record.

     Events whose first message segment cannot be read are stored with an
     empty ``event_type`` and ``text``. These used to be left unassigned,
     which raised ``UnboundLocalError`` on the first such event and
     silently reused the previous event's values afterwards.
     """
     json_files = self.get_json_files()
     for json_file in json_files:
         print_green(json_file)
         with open(json_file, buffering=1000, encoding='iso-8859-1') as f:
             j = json.load(f)
         # The document is fully parsed, so the file can be closed before
         # the (potentially long) processing below.
         print_blue(j)
         conversations = j["conversations"]
         for conversation in conversations:
             participant_data = conversation["conversation"][
                 "conversation"]["participant_data"]
             participants = self.get_participants(participant_data)
             print_magenta(participants)
             events = conversation["events"]
             for event in events:
                 conversation_id = event["conversation_id"]["id"]
                 sender_id = event["sender_id"]["gaia_id"]
                 fallback_name = participants[sender_id]
                 timestamp = event["timestamp"]
                 event_id = event["event_id"]
                 try:
                     segment = event["chat_message"]["message_content"][
                         "segment"][0]
                     event_type = segment["type"]
                     text = self.convert_encoding(segment["text"])
                 except Exception as e:
                     print_yellow(
                         f"cannot find segment in {event}, {e}")
                     # Reset both values so nothing leaks in from the
                     # previous event (or stays unbound on the first one).
                     event_type = ""
                     text = ""
                 print_green(
                     f"conversation_id => {conversation_id}, sender_id => {sender_id}, fallback_name => {fallback_name}, timesttamp => {timestamp}, event_id => {event_id}, event_type => {event_type}, text => {text}"
                 )
                 message = self.__find_message(event_id)
                 if message:
                     self.__update_message(event_id, text, sender_id,
                                           fallback_name, timestamp,
                                           conversation_id, event_type)
                 else:
                     self.__set_message(event_id, text, sender_id,
                                        fallback_name, timestamp,
                                        conversation_id, event_type)
Пример #7
0
 def set_values_to_db(self):
     """Load every tab-separated question/sentence file and store its rows.

     Each path returned by ``self.get_json_files()`` is parsed with pandas
     using a tab delimiter. Every row is logged, then looked up by its
     document id: a truthy lookup result triggers an update of the stored
     record, anything else creates a new one.
     """
     for source_path in self.get_json_files():
         print_green(source_path)
         frame = pd.read_csv(source_path, delimiter='\t')
         for _, row in frame.iterrows():
             print_blue(row)
             # Columns are read in the original order so a missing column
             # fails on the same key as before.
             question_id = row["QuestionID"]
             question = row["Question"]
             document_id = row["DocumentID"]
             document_title = row["DocumentTitle"]
             sentence_id = row["SentenceID"]
             sentence = row["Sentence"]
             label = row["Label"]
             print_green(
                 f"question_id => {question_id}, question => {question}, document_id => {document_id}, document_title => {document_title}, sentence_id => {sentence_id}, sentence => {sentence}, label => {label}")
             record = (question_id, question, document_id, document_title,
                       sentence_id, sentence, label)
             if self.__find_message(document_id):
                 self.__update_message(*record)
             else:
                 self.__set_message(*record)
Пример #8
0
    def set_values_to_db(self):
        """Load line-delimited JSON comment dumps and store qualifying rows.

        Every ``.json`` file found by ``self._get_dir_files`` in
        ``self.destination_folder`` is read line by line; each line is
        parsed as one JSON comment. Comments with a score of at least 2
        whose body passes ``__acceptable`` are stored:

        * if ``__find_score(parent_id)`` returns a truthy score, the
          comment is stored only when its own score is higher;
        * otherwise it is inserted, flagged by whether parent data was
          found (rows with parent data are counted as paired).

        ``display_rows`` and ``clean_rows`` are called after every row.
        Files with another suffix are reported and skipped.

        The comment id is taken from ``id``, falling back to ``name``. It
        is reset for every row, so a comment carrying neither key is
        passed on with ``None`` rather than with the previous row's id.
        """
        row_counter = 0
        paired_rows = 0
        f_list = self._get_dir_files(self.destination_folder)
        dir_path = Path(__file__).parent.parent.parent.parent
        for f_name in f_list:
            if f_name.suffix != '.json':
                print_red(f"file of name {f_name} is not a json file")
                continue

            file = f"{dir_path}/{f_name}"
            print_blue(file)

            with open(file, buffering=1000) as f:
                for data in f:
                    element = json.loads(data)
                    row_counter += 1
                    print_blue(element)
                    parent_id = element['parent_id']
                    body = self.__format_data(element['body'])
                    created_utc = element['created_utc']
                    score = element['score']

                    # Reset per row; 'id' is tried last so it wins over
                    # 'name' when both keys are present.
                    comment_id = None
                    for id_key in ('name', 'id'):
                        try:
                            comment_id = element[id_key]
                        except KeyError as e:
                            print_yellow(
                                f"comment id by {id_key} do not exists, {e}")

                    subreddit = element['subreddit']
                    parent_data = self.__find_parent(parent_id)
                    print_green(
                        f"parent_id => {parent_id}, body => {body}, created_utc => {created_utc}, comment_id => {comment_id}, subreddit => {subreddit}, parent_data => {parent_data}"
                    )
                    if score >= 2:
                        comment_score = self.__find_score(parent_id)
                        if comment_score:
                            # A reply is already stored for this parent:
                            # only a higher-scored comment goes through.
                            if (score > comment_score
                                    and self.__acceptable(body)):
                                self.insert_or_replace_comment(
                                    comment_id, parent_id, parent_data,
                                    body, subreddit, created_utc, score)
                        elif self.__acceptable(body):
                            if parent_data:
                                self.insert_parent(
                                    True, comment_id, parent_id,
                                    parent_data, body, subreddit,
                                    created_utc, score)
                                paired_rows += 1
                            else:
                                self.insert_parent(
                                    False, comment_id, parent_id, None,
                                    body, subreddit, created_utc, score)
                    self.display_rows(row_counter, data, paired_rows)
                    self.clean_rows(row_counter, data)
Пример #9
0
 def get_encoding_type(self, file):
     """Return the encoding reported by ``detect`` for the file's bytes.

     The path is logged, the whole file is read in binary mode, and the
     ``'encoding'`` entry of the detection result is returned.
     """
     print_blue(file)
     with open(file, 'rb') as handle:
         raw_bytes = handle.read()
     detection = detect(raw_bytes)
     return detection['encoding']