Example No. 1
    def __init__(self, file: Path, out_filename, out=".", mode=1, padding=0):
        self.filename = file.stem
        self.padding = padding
        self.out = out
        self.out_filename = out_filename
        self.mode = mode
        # Parse the input file into a tree and build the PAGE XML from it.
        self.tree = utils.parse_file(str(file))
        self.xml = PAGEBuilder(tree=self.tree, filename=self.out_filename)
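The constructor above leans on pathlib; a quick standalone look at the two values it derives from file (the path here is made up for illustration):

    from pathlib import Path

    file = Path('scans/page_001.ocr')  # hypothetical input path
    print(file.stem)   # base name without suffix -> stored as self.filename
    print(str(file))   # full path string -> passed to utils.parse_file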
Example No. 2
    def get_data(self):
        # Yield batches of (names, [mfcc, spectrogram]) until the file queue is empty.
        while self.files:
            names, batch_sp, batch_mfcc = [], [], []
            while len(names) < self.batch_size and self.files:
                fname = self.files.pop()

                _, spectrogram, features = parse_file(fname)
                names.append(fname.split('/')[-1])
                batch_sp.append(spectrogram)
                batch_mfcc.append(features)

            batch_mfcc, batch_sp = map(np.array, (batch_mfcc, batch_sp))
            yield names, [batch_mfcc, batch_sp]
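The inner batching pattern (pop from a queue until batch_size items are gathered, then yield) can be shown standalone; a minimal sketch with plain filenames instead of parsed audio:

    import numpy as np

    def batches(files, batch_size):
        # Same drain-and-yield pattern as get_data above.
        files = list(files)
        while files:
            batch = []
            while len(batch) < batch_size and files:
                batch.append(files.pop())
            yield np.array(batch)

    for b in batches(['a.wav', 'b.wav', 'c.wav'], batch_size=2):
        print(b)  # ['c.wav' 'b.wav'], then ['a.wav']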
Example No. 3
    def read_data(self):
        train_path = 'data/train/audio'
        labels = sorted(
            [x for x in os.listdir(train_path) if not x.endswith('_')])
        x_data_mfcc, x_data_sp, y_data = [], [], []

        for label in tqdm(labels, desc=f'{self.folds}'):
            filelist = glob(os.path.join(train_path, label) + '/*')
            # Keep only files whose fold assignment belongs to this split.
            filelist = (f for f in filelist
                        if self.assign_fold(f) in self.folds)

            idx = self.labels_idx.get(label, self.labels_idx['unknown'])

            for _, spectrogram, features in (parse_file(x) for x in filelist):
                # Subsample the 'unknown' class: keep only ~5% of its files.
                if (self.subsample and idx == self.labels_idx['unknown']
                        and np.random.rand() > .05):
                    continue
                x_data_mfcc.append(features)
                x_data_sp.append(spectrogram)
                y_data.append(idx)

        return map(np.array, (x_data_mfcc, x_data_sp, y_data))
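The 5% keep-rate above is a simple rejection subsample; a standalone illustration of the rate it produces:

    import numpy as np

    # Each 'unknown' sample survives only when rand() <= 0.05, i.e. ~5% kept.
    kept = sum(np.random.rand() <= .05 for _ in range(10000))
    print(kept, 'of 10000 kept')  # roughly 500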
Example No. 4
    print('Length of text after preprocessing: %d' % len(preprocess_text))
    vocabulary = list(set(preprocess_text))
    # Build weighted edges over the text, score the vertices, and take the
    # top `size` words as keywords.
    weighted_edges = calculated_weighted_edges_ver2(preprocess_text, 8)
    inout = calculated_inout(weighted_edges)
    scores = calculated_weighted_vertices(inout, weighted_edges, threshold=0.0)
    result = get_keys(scores, vocabulary, size=size)
    for key in result:
        print(key, end='\t-\t')
    print('\n')


from src import tf
from src import utils  # assumed import path; utils.parse_file is used below
from pyvi import ViTokenizer

if __name__ == '__main__':
    corpus = utils.parse_file('12.txt')
    # Join the non-empty lines of b.txt into a single text.
    with open('b.txt', 'r') as f:
        sentences = [line.strip() for line in f if line.strip()]
    text = ' '.join(sentences)
    print(text)
    text = ViTokenizer.tokenize(text)
    print('--------------------------------------')
    get_keywords(text, 10)
    print('--------------------------------------')
    tf.keyword_extraction(text, 10)
Example No. 5
    def __init__(self, file: Path, out_filename, out="."):
        self.filename = file.stem
        self.out = out
        self.out_filename = out_filename
        # Same pattern as Example No. 1: parse the file, then build the PAGE XML.
        self.tree = utils.parse_file(str(file))
        self.xml = PAGEBuilder(tree=self.tree, filename=self.out_filename)
Example No. 6
import math
from collections import Counter


def tf(text):
    # Term frequency: each token's count divided by the total token count.
    # (Header reconstructed from the call site below; N assumed = len(text).)
    N = len(text)
    TF = {}
    for key, count in Counter(text).items():
        TF[key] = count / N
    return TF
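For intuition, a quick worked check of the same term-frequency computation on a toy token list:

    from collections import Counter

    tokens = ['cat', 'dog', 'cat', 'fish']
    N = len(tokens)
    print({w: c / N for w, c in Counter(tokens).items()})
    # {'cat': 0.5, 'dog': 0.25, 'fish': 0.25}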


def keyword_extraction(text, size=10):
    IDF = utils.load_dict('IDF')
    TF = tf(text)
    for key in TF:
        if key not in IDF:
            # Fallback IDF: treat unseen words as occurring in 1 of 2000 documents.
            TF[key] = TF[key] * math.log10(2000 / 1)
        else:
            TF[key] = TF[key] * IDF[key]
    # Print the `size` highest-scoring words.
    keys = sorted(TF.keys(), key=lambda x: TF[x], reverse=True)
    for key in keys[:size]:
        print(key, end='\t-\t')
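That fallback weight is easy to check by hand:

    import math

    tf_score = 0.05                      # a term frequency from tf()
    idf_fallback = math.log10(2000 / 1)  # ~= 3.301
    print(tf_score * idf_fallback)       # ~= 0.165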


if __name__ == '__main__':
    sentences = utils.parse_file('12.txt')
    # Show one parsed sentence and its part-of-speech tags.
    print(sentences[10])
    print(pos_tag(sentences[10]))