Пример #1
0
 def vocabulary(self):
     """Build the combined vocabulary DataFrame.

     Reads the vocabulary CSV and the dictionary CSV (both indexed by
     ``idcorpus``), concatenates whichever of them could be read, removes
     duplicate rows, and then applies the patch CSV: every row whose
     ``word`` also occurs in the patch is replaced by the patch rows.

     A source file that cannot be read is reported through ``env.debug``
     and skipped, so the result is built from the remaining sources. If
     neither source can be read, the result contains the patch rows only.
     Failure to read the patch file itself is not handled here and
     propagates to the caller.

     Returns:
         pandas.DataFrame: the merged rows with a fresh 0..n-1 index
         named ``idcorpus``.
     """
     env = Environment()
     file_voc = env.filename_vocabulary_csv()  # from vocabulary file
     file_dict = env.filename_dict_csv()  # from dictionary file

     # Read both sources the same way; keep only the ones that loaded.
     frames = []
     for label, path in (('vocabulary', file_voc),
                         ('dictionary', file_dict)):
         try:
             frame = pd.read_csv(path,
                                 index_col='idcorpus',
                                 encoding='utf-8')
         except (OSError, ValueError):
             # OSError: missing/unreadable file. ValueError: pandas parse
             # errors, empty file, missing index column, bad encoding.
             env.debug(1, ['Failed to read %s file:' % label, path])
         else:
             env.debug(1, ['Read %s OK:' % label, path])
             frames.append(frame)

     # Apply patch words
     df_patch = pd.read_csv(env.filename_vocabulary_patch_csv(),
                            index_col='idcorpus',
                            encoding='utf-8')
     if frames:
         df_res = pd.concat(frames).drop_duplicates()
         # Filter with a boolean mask rather than drop(index labels): the
         # concatenated sources can share idcorpus labels, and a
         # label-based drop would also remove unrelated rows that happen
         # to carry the same label as a patched word.
         df_res = df_res[~df_res['word'].isin(df_patch['word'])]
         df_res = pd.concat([df_res, df_patch])
     else:
         df_res = df_patch

     df_res = df_res.reset_index(drop=True)
     df_res.index.name = 'idcorpus'
     return df_res
Пример #2
0
 def dict_xml2csv(self, persistent=True, lines=10000):
     """Load a sample of lemmas from the XML dictionary into a DataFrame.

     The file is scanned line by line. A line is treated as a lemma
     record when ``line[5:10] == 'lemma'``; it is parsed as a standalone
     XML element, and the ``t`` attribute of its ``<l>`` element plus the
     ``v`` attribute of that element's first ``<g>`` child become one row.
     The grammeme name is resolved to its index label in the frame
     returned by ``self.grammemes()`` (matched on the ``name`` column).

     Only a sample is read: ``step = number_lines // lines`` and a lemma
     line is taken only when the running line counter is zero.

     Args:
         persistent: when true, the result is also written to the CSV
             file named by ``env.filename_dict_csv()``.
         lines: divisor used to derive the sampling step from the total
             number of lines in the file.

     Returns:
         pandas.DataFrame with columns ``word``, ``gram``, ``idgram`` and
         an index named ``idcorpus``. If the XML file cannot be opened,
         an empty frame with those columns is returned and nothing is
         written.

     Raises:
         IndexError: if a lemma's grammeme name is not found in the
             grammemes frame.
     """
     t_start = timer()
     env = Environment()
     dfgram = self.grammemes()
     filename_dict = env.filename_dict_xml()
     dfcols = ['word', 'gram', 'idgram']
     env.debug(
         1, ['CORPUS', 'Start to load dictionary from XML:', filename_dict])
     try:
         fp = open(filename_dict, mode="r", encoding="utf-8")
     except OSError:
         env.debug(1, [
             'CORPUS', 'Failed to open dictionary file XML:', filename_dict
         ])
         return pd.DataFrame(columns=dfcols)

     # Rows are collected in a plain list and turned into a DataFrame
     # once; appending to a DataFrame per row is quadratic and
     # DataFrame.append no longer exists in pandas 2.x.
     rows = []
     # `with` guarantees the file is closed even if parsing or the
     # grammeme lookup raises.
     with fp:
         # First pass: count lines to derive the sampling step.
         number_lines = sum(1 for _ in fp)
         fp.seek(0)
         t_end = timer()
         env.debug(1, [
             'CORPUS', 'File opened:', 'lines',
             '%s' % number_lines, 'time:',
             env.job_time(t_start, t_end)
         ])
         t_start = timer()
         step = number_lines // lines
         env.debug(1, [
             'CORPUS', 'Read dictionary:', filename_dict,
             'lines: %s step %s' % (lines, step)
         ])
         n_line = 0
         for line in fp:
             if line[5:10] == 'lemma' and n_line == 0:
                 tree = ET.fromstring(line)
                 lemma_elems = list(tree.iter('l'))
                 if lemma_elems:
                     # With several <l> elements the last one wins.
                     elem = lemma_elems[-1]
                     first_gram = next(elem.iter('g'), None)
                     gram_name = ('' if first_gram is None else
                                  first_gram.attrib.get('v'))
                     idgram = int(dfgram.index[dfgram['name'] ==
                                               gram_name].tolist()[0])
                     rows.append([elem.attrib.get('t'), gram_name, idgram])
                 # A sampled line advances the counter twice (here and
                 # below); kept as is so the sampling pattern is unchanged.
                 n_line += 1
             n_line += 1
             if n_line >= step:
                 n_line = 0

     df_xml = pd.DataFrame(rows, columns=dfcols)
     df_xml.index.name = 'idcorpus'
     t_end = timer()
     env.debug(1, [
         'CORPUS', 'Dictionary loaded:', 'time:',
         env.job_time(t_start, t_end)
     ])
     if persistent:
         filename_csv = env.filename_dict_csv()
         env.debug(1,
                   ['CORPUS', 'Write dictionary to CSV:', filename_csv])
         df_xml.to_csv(filename_csv, encoding='utf-8')
         env.debug(1, ['CORPUS', 'Dictionary saved:', filename_csv])
     return df_xml