def __init__(self, file: Path, out_filename, out=".", mode=1, padding=0): self.filename = file.stem self.padding = padding self.out = out self.out_filename = out_filename self.mode = mode self.tree = utils.parse_file(str(file)) self.xml = PAGEBuilder(tree=self.tree, filename=self.out_filename)
def get_data(self):
    # Yield batches until the file queue is exhausted.
    while len(self.files):
        names, batch_sp, batch_mfcc = [], [], []
        # Fill one batch, stopping early if the queue runs out.
        while len(names) < self.batch_size and len(self.files):
            fname = self.files.pop()
            data, spectrogram, features = parse_file(fname)
            names.append(fname.split('/')[-1])
            batch_sp.append(spectrogram)
            batch_mfcc.append(features)
        batch_mfcc, batch_sp = map(np.array, (batch_mfcc, batch_sp))
        yield names, [batch_mfcc, batch_sp]
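# Usage sketch (hypothetical names: `reader` is an instance of the class that
# owns get_data, `model` is a trained two-input Keras-style model; neither is
# defined here). Each yielded batch pairs the popped file names with the
# stacked [MFCC, spectrogram] arrays, so predictions map back to files:
#
# predictions = {}
# for names, batch in reader.get_data():
#     preds = model.predict(batch)  # batch is [batch_mfcc, batch_sp]
#     for fname, pred in zip(names, preds):
#         predictions[fname] = int(pred.argmax())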
def read_data(self):
    train_path = 'data/train/audio'
    # One subdirectory per label; background-noise folders end with '_'.
    labels = sorted(
        [x for x in os.listdir(train_path) if not x.endswith('_')])
    x_data_mfcc, x_data_sp, y_data = [], [], []
    for label in tqdm(labels, desc=f'{self.folds}'):
        filelist = glob(os.path.join(train_path, label) + '/*')
        # Keep only the files assigned to the requested folds.
        filelist = (f for f in filelist if self.assign_fold(f) in self.folds)
        # Labels outside the known set are mapped to 'unknown'.
        idx = self.labels_idx.get(label, self.labels_idx['unknown'])
        for i, (r, s, f) in enumerate([parse_file(x) for x in filelist]):
            # When subsampling, keep only ~5% of 'unknown' clips for balance.
            if self.subsample:
                if idx == self.labels_idx['unknown'] and np.random.rand() > .05:
                    continue
            x_data_mfcc.append(f)
            x_data_sp.append(s)
            y_data.append(idx)
    return map(np.array, (x_data_mfcc, x_data_sp, y_data))
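# `assign_fold` is referenced above but not shown. A minimal sketch of one
# common approach for this kind of dataset (an assumption, not the original
# code): hash the speaker id embedded in the filename so every clip from a
# speaker lands in the same fold, keeping the folds speaker-disjoint.
import hashlib
import os

def assign_fold_sketch(fname, n_folds=5):
    # Speech Commands files are named '<speaker>_nohash_<n>.wav'.
    speaker = os.path.basename(fname).split('_')[0]
    digest = hashlib.md5(speaker.encode('utf-8')).hexdigest()
    return int(digest, 16) % n_folds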
    print('Length of text after preprocessing: %d' % len(preprocess_text))
    vocabulary = list(set(preprocess_text))
    # Build the co-occurrence graph over a sliding window of 8 tokens,
    # score the vertices iteratively, then take the top `size` keywords.
    weighted_edges = calculated_weighted_edges_ver2(preprocess_text, 8)
    inout = calculated_inout(weighted_edges)
    scores = calculated_weighted_vertices(inout, weighted_edges, threshold=0.0)
    result = get_keys(scores, vocabulary, size=size)
    for key in result:
        print(key, end='\t-\t')
    print('\n')


from src import tf
from pyvi import ViTokenizer

if __name__ == '__main__':
    corpus = utils.parse_file('12.txt')
    sentences = []
    with open('b.txt', 'r') as f:
        for line in f:
            if line.strip():
                sentences.append(line.strip())
    text = ''
    for sent in sentences:
        text += str(sent) + ' '
    print(text)
    # Vietnamese word segmentation before keyword extraction.
    text = ViTokenizer.tokenize(text)
    print('--------------------------------------')
    get_keywords(text, 10)
    print('--------------------------------------')
    tf.keyword_extraction(text, 10)
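# The helpers above follow the TextRank recipe: co-occurrence edges inside an
# 8-token window, per-vertex weight sums, then iterative scoring. A minimal
# sketch of the standard weighted TextRank update (an assumption about what
# calculated_weighted_vertices computes, not its actual code; the dict
# layouts below are also assumed):
def textrank_sketch(weighted_edges, out_weight, d=0.85, iters=30):
    # weighted_edges: {(u, v): weight}; out_weight: {u: total outgoing weight}.
    scores = {v: 1.0 for v in out_weight}
    for _ in range(iters):
        new_scores = {}
        for v in out_weight:
            rank = sum(w / out_weight[u] * scores[u]
                       for (u, t), w in weighted_edges.items() if t == v)
            new_scores[v] = (1 - d) + d * rank
        scores = new_scores
    return scores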
def __init__(self, file: Path, out_filename, out="."): self.filename = file.stem self.out = out self.out_filename = out_filename self.tree = utils.parse_file(str(file)) self.xml = PAGEBuilder(tree=self.tree, filename=self.out_filename)
    # Term frequency: raw count of each token divided by the text length N.
    text = dict(Counter(text))
    TF = {}
    for key in text:
        if key not in TF:
            TF[key] = text[key] / N
    return TF


def keyword_extraction(text, size=10):
    # Score each term by TF * IDF; terms missing from the IDF table fall
    # back to log10(2000 / 1), 2000 presumably being the number of documents
    # the table was built from.
    IDF = utils.load_dict('IDF')
    TF = tf(text)
    for key in TF:
        if key not in IDF:
            TF[key] = TF[key] * math.log10(2000 / 1)
        else:
            TF[key] = TF[key] * IDF[key]
    # Print the top `size` terms by score.
    keys = sorted(TF.keys(), key=lambda x: TF[x], reverse=True)
    count = 0
    for key in keys:
        if count == size:
            break
        print(key, end='\t-\t')
        count += 1


if __name__ == '__main__':
    sentences = utils.parse_file('12.txt')
    print(sentences[10])
    print(pos_tag(sentences[10]))
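# Worked example (illustration only, not part of the original module): the
# score above is TF * IDF with IDF = log10(N / df) and N = 2000 documents;
# unseen words fall back to log10(2000 / 1), the largest possible IDF.
import math

def tfidf_score_sketch(count_in_text, text_len, doc_freq, n_docs=2000):
    tf_value = count_in_text / text_len
    idf = math.log10(n_docs / doc_freq) if doc_freq else math.log10(n_docs)
    return tf_value * idf

# A word seen 3 times in a 100-token text and in 20 of 2000 documents:
# tfidf_score_sketch(3, 100, 20) == 0.03 * 2.0 == 0.06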