 def _get_authors(self, authors_path, num_partitions, n_partition):
     self.authors = np.load(authors_path)
     printer.print_progress(f'Got {len(self.authors)} unique authors')
     self.authors = np.array_split(self.authors, num_partitions)
     printer.print_warning(f'Number of partitions: {len(self.authors)}')
     self.authors = self.authors[n_partition]
     printer.print_progress(f'Got {len(self.authors)} authors in partition {n_partition}')
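The split relies on `np.array_split`, which (unlike `np.split`) accepts a partition count that does not divide the array evenly. A minimal sketch of that behavior, using a stand-in array rather than the real authors file:

 import numpy as np

 authors = np.arange(10)              # stand-in for the loaded author ids
 parts = np.array_split(authors, 3)   # uneven sizes are allowed
 print([len(p) for p in parts])       # [4, 3, 3]
 print(parts[1])                      # what a worker with n_partition=1 keeps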
Example #2
 def __init__(self, input_path, output_path):
     self.docs = pd.read_csv(input_path)
     self.docs = self.docs.fillna('0')
     self.comment_ids = self.docs['comment_id']
     self.docs = self.docs['comment_text'].tolist()
     self.tokenized_docs = []
     printer.print_progress('Text loaded')
     self.output_path = output_path
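This loader assumes a CSV with `comment_id` and `comment_text` columns; note that `fillna('0')` turns missing comment text into the string '0' rather than dropping the row. A small sketch with made-up data:

 import pandas as pd

 docs = pd.DataFrame({
     'comment_id': [1, 2, 3],
     'comment_text': ['good point', None, 'agreed'],
 })
 docs = docs.fillna('0')                 # NaN -> the string '0'
 print(docs['comment_text'].tolist())    # ['good point', '0', 'agreed']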
Example #3
 def start(self):
     printer.print_progress('Start collecting')
     with open(self.input_path, 'rb') as f:
         f.readline()  # move over header
         for _ in tqdm(range(self.number_of_lines)):
             offset = f.tell()
             line = f.readline().decode('utf-8')
             if not line:
                 break
             comment_id = self._parse_line(line)[0]
             self.output.append([comment_id, offset])
     self._save()
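Calling `f.tell()` before each `readline()` records the byte offset where that comment's line starts, so the line can later be fetched with a single `seek` instead of rescanning the file. A self-contained sketch of the same idea over an in-memory file:

 import io

 f = io.BytesIO(b'comment_id,comment_text\nc1,hello\nc2,world\n')
 f.readline()                      # move over header
 index = {}
 while True:
     offset = f.tell()             # byte position before the line is read
     line = f.readline().decode('utf-8')
     if not line:
         break
     index[line.split(',')[0]] = offset

 f.seek(index['c2'])               # jump straight to the second comment
 print(f.readline().decode('utf-8').strip())   # c2,world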
Example #4
 def make_prep(self):
     current_pos = 0
     resp = get_url_response(
         VkWorker.api_url, {
             "method": "groups.getMembers",
             "params": "group_id={id}&count=1".format(id=self.group_id),
             "token": self.token,
             "api_version": self.version
         })
     total_amount = extract_total(resp)
     printer = print_progress()
     printer.send(None)  # prime the progress coroutine
     return current_pos, total_amount, printer
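Here `print_progress` looks like a generator used as a coroutine: `send(None)` primes it, and the caller presumably pushes progress updates into it afterwards. The real body is not shown, so the following is only a sketch of that pattern:

 def print_progress():
     # coroutine: prime with send(None), then push (current, total) pairs in
     while True:
         current, total = (yield)
         print(f'\rprogress: {current}/{total}', end='')

 printer = print_progress()
 printer.send(None)        # advance the generator to its first yield
 printer.send((10, 100))   # prints: progress: 10/100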
Example #5
 def _get_authors(self):
     comment_count_df = self.train_data_df.groupby('author_id')['comment_id'].count()
     comment_count_df = comment_count_df.reset_index()
     comment_count_df = comment_count_df[comment_count_df['comment_id'] >= self.min_comment_count]
     self.authors = comment_count_df['author_id'].unique()
     printer.print_progress(f'Got {len(self.authors)} unique authors')
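The groupby chain counts comments per author and keeps only authors at or above `min_comment_count`. A small sketch with made-up data and a threshold of 2:

 import pandas as pd

 train_data_df = pd.DataFrame({
     'author_id': ['a', 'a', 'a', 'b'],
     'comment_id': [1, 2, 3, 4],
 })
 counts = train_data_df.groupby('author_id')['comment_id'].count().reset_index()
 authors = counts[counts['comment_id'] >= 2]['author_id'].unique()
 print(authors)                    # ['a'] -- 'b' falls below the threshold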
Example #6
 def _prepare_data(self):
     self.train_data_df['timestamp'] = pd.to_datetime(self.train_data_df['timestamp'])
     printer.print_progress('Data prepared')
Example #7
 def _write_header(self):
     with open(self.output_path, 'w', encoding='utf-8') as f:
         writer = csv.writer(f)
         writer.writerow(['comment_id', 'comment_text'])
     printer.print_progress('Header added')
Example #8
 def _prepare_data(self):
     self.data_df['timestamp'] = pd.to_datetime(self.data_df['timestamp'])
     self.data_df = self.data_df.sort_values(by='timestamp',
                                             ascending=False)
     printer.print_progress('Data prepared')
Example #9
 def _prepare_data(self):
     self.train_data_df_top_comments['timestamp'] = pd.to_datetime(
         self.train_data_df_top_comments['timestamp'])
     self.train_data_df_top_comments = self.train_data_df_top_comments.sort_values(
         by='timestamp', ascending=False)
     printer.print_progress('Data prepared')
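Examples #6, #8, and #9 all prepare the frame the same way: parse `timestamp` into datetimes, then (in #8 and #9) sort newest-first. A compact illustration:

 import pandas as pd

 df = pd.DataFrame({'timestamp': ['2020-01-02', '2020-01-01', '2020-01-03']})
 df['timestamp'] = pd.to_datetime(df['timestamp'])     # strings -> datetime64
 df = df.sort_values(by='timestamp', ascending=False)  # newest first
 print(df['timestamp'].iloc[0])                        # 2020-01-03 00:00:00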
Example #10
 def make_tf_idf(self):
     printer.print_progress('Run TFIDF Model')
     self.model = TfidfModel(self.corpus, normalize=False)
     printer.print_success('Finished creating the TF-IDF model')
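`TfidfModel` here is presumably gensim's, which expects a bag-of-words corpus; with `normalize=False` the resulting vectors carry raw tf-idf weights rather than unit-normalized ones. A minimal end-to-end sketch with toy documents:

 from gensim.corpora import Dictionary
 from gensim.models import TfidfModel

 texts = [['apple', 'banana'], ['apple', 'apple', 'cherry']]
 dictionary = Dictionary(texts)
 corpus = [dictionary.doc2bow(t) for t in texts]   # bag-of-words corpus
 model = TfidfModel(corpus, normalize=False)       # raw tf-idf weights
 print(model[corpus[1]])   # (id, weight) pairs; terms in every doc get weight 0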
Example #11
 def _load_offsets(self):
     offset_df = pd.read_csv(self.offset_path)
     self.offset_dict = offset_df.set_index(
         'comment_id').to_dict()['offset']
     printer.print_progress('Offsets loaded')
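With `offset_dict` mapping `comment_id` to the byte offset recorded in Example #3, a single comment can then be fetched by seeking into the raw file. The helper below is hypothetical (it is not in the source) and assumes the class also knows the original `input_path`:

 def _get_comment(self, comment_id):
     # hypothetical helper: seek to the stored byte offset and read one line
     with open(self.input_path, 'rb') as f:
         f.seek(self.offset_dict[comment_id])
         return f.readline().decode('utf-8').strip()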