def get_corpus(self):
    SKIP_LIST = []  ## filter(None, CLI_ARGS.skiplist.split(","))
    ## Extract training and development datasets.
    ## The data merge happens later; ArchiveImporter builds the final train/test/dev datasets.
    utterances = {}
    audios = []

    wav_root_dir = os.path.join(self.origin_data_path, 'it_IT')

    # Get the audio-file path and transcript for each sentence listed in metadata.csv
    glob_dir = os.path.join(wav_root_dir, "**/metadata.csv")
    for record in glob(glob_dir, recursive=True):
        if any(sk in record for sk in SKIP_LIST):
            continue

        enc = encoding_from_path(record)
        with open(record, "r", encoding=enc) as rec:
            for line in rec:
                cols = line.strip().split("|")
                audio = os.path.join(os.path.dirname(record), "wavs",
                                     cols[0] + ".wav")
                transcript = cols[2]
                ## Append to the data manifest
                utterances[audio] = transcript
                audios.append(audio)

    ## Collect the corpus
    corpus = Corpus(utterances, audios)
    ## evalita2009 clips are already WAV, 16000 Hz, mono:
    ## no resampling required
    corpus.make_wav_resample = False
    return corpus
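A minimal sketch of the pipe-separated metadata.csv row the loop above parses; the sample line is invented, and the id|raw|normalized layout is inferred from the indices used (it matches the LJSpeech convention):

# Hypothetical row: <file id>|<raw transcript>|<normalized transcript>
cols = "wav_0001|Testo grezzo.|testo normalizzato.".split("|")
audio_id, raw_text, normalized_text = cols
print(audio_id, normalized_text)  # -> wav_0001 testo normalizzato.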
Example #2
def pankong(items):
    # Return the first element, stripped of whitespace; '' when the list is empty
    # (the original parameter was named `re`, which shadows the re module)
    if len(items) == 0:
        return ''
    return items[0].strip()
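A minimal usage sketch for pankong, assuming it guards the first element of a possibly empty result list (sample inputs invented):

print(pankong(['  Hello  ', 'World']))  # -> 'Hello'
print(pankong([]))                      # -> ''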
def get_corpus(self):
    SKIP_LIST = []  ## filter(None, CLI_ARGS.skiplist.split(","))
    ## Extract training and development datasets.
    ## The data merge happens later; ArchiveImporter builds the final train/test/dev datasets.
    utterances = {}
    audios = []

    fixed_token = {}
    wav_root_dir = os.path.join(self.origin_data_path, 'it_IT')

    bad_examples = self.get_bad_examples()

    # Get the audio-file path and transcript for each sentence listed in metadata.csv
    glob_dir = os.path.join(wav_root_dir, "**/metadata.csv")
    for record in glob(glob_dir, recursive=True):
        if any(sk in record for sk in SKIP_LIST):
            continue

        enc = encoding_from_path(record)
        with open(record, "r", encoding=enc) as rec:
            for line in rec:
                cols = line.strip().split("|")

                filename = cols[0]
                ## Filter bad examples (https://github.com/MozillaItalia/DeepSpeech-Italian-Model/issues/124#issuecomment-798613031)
                if filename in bad_examples:
                    continue
                audio = os.path.join(os.path.dirname(record), "wavs", filename + ".wav")
                transcript_source = cols[1]
                transcript = cols[2]
                ## MLS transcript normalization handles the character '’' incorrectly,
                ## so repair it from the raw transcript
                transcript = fix_apostrophe(transcript_source, transcript, fixed_token)

                ## Append to the data manifest
                utterances[audio] = transcript
                audios.append(audio)

    ## Collect the corpus
    corpus = Corpus(utterances, audios)
    ## evalita2009 clips are WAV, 16000 Hz, mono and would not need resampling
    corpus.make_wav_resample = True

    ## self.save_wrong_token_dictionary(fixed_token)

    return corpus
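fix_apostrophe is defined elsewhere in the importer; purely as an assumption, here is a minimal sketch of the kind of repair the comment above describes (the helper name and logic are hypothetical):

def fix_apostrophe_sketch(source, normalized, fixed_token):
    # Hypothetical sketch only; NOT the importer's actual implementation.
    # When a source token carries '’' but its normalized counterpart lost it,
    # fall back to the lower-cased source token and remember the repair.
    out = []
    for src_tok, norm_tok in zip(source.split(), normalized.split()):
        if "’" in src_tok and "'" not in norm_tok and "’" not in norm_tok:
            fixed_token[norm_tok] = src_tok.lower()
            out.append(src_tok.lower())
        else:
            out.append(norm_tok)
    return " ".join(out)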
Example #4
def find_end(f_name):
    # Collect keyword names from <kw name="..." library="..."> lines
    # until the 'End Web Test' keyword is reached
    new_lst = []
    with open(f_name, 'r') as f:
        for line in f:
            if '<kw name="' in line:
                name = line.split('<kw name="', 1)[1].split('" library=')[0]
                if 'End Web Test' in name:
                    break
                new_lst.append(name.strip('>\n'))
    return new_lst
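A minimal usage sketch for find_end, with an invented log file in the <kw name="..." library="..."> format the parser expects:

with open('sample_log.xml', 'w') as f:
    f.write('<kw name="Open Browser" library="SeleniumLibrary">\n')
    f.write('<kw name="Click Element" library="SeleniumLibrary">\n')
    f.write('<kw name="End Web Test" library="Common">\n')

print(find_end('sample_log.xml'))  # -> ['Open Browser', 'Click Element']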
Example #5
def search_in_subs(bookdir, subsdir, val=None):
    with open("$PersonalReport.csv", "r", encoding="utf-8") as file:
        data = file.read().splitlines()
    refdict = {}
    for d in data:
        k, _, rest = d.split(",", 2)
        refdict[k] = rest.strip().replace(", ", ",").replace(".txt", "")
    if val is None:
        print("Top 20 Words:")
        for i, item in enumerate(data[:20], 1):
            print(i, ". " + item, sep="")
        print("specify a word: ")
        val = input()
    if val in refdict:
        for i, item in enumerate(refdict[val].split(","), 1):
            print(i, ". " + item, sep="")
        print("Choose the file")
        choice = input()
        sel = refdict[val].split(",")[int(choice) - 1]
    elif val.isnumeric():
        val = data[int(val) - 1].split(",")[0]
        for i, item in enumerate(refdict[val].split(","), 1):
            print(i, ". " + item, sep="")
        print("Choose the file")
        choice = input()
        # sel = refdict[data[val-1].split(",")[0]].split(",")[int(choice) - 1]
        sel = refdict[val].split(",")[int(choice) - 1]
        print(sel)
    else:
        print("not found in the report")
        return
    print(f"{subsdir}/{sel}")
    sublogger = print_matching_sub_lines(subsdir, selection=sel, value=val)
    sublogger.close()
    more_choice = input("Do you want to search in another file? ")
    if more_choice == "":
        return
    if more_choice[0].lower() == "y":
        search_in_subs(bookdir, subsdir, val=str(val))
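A sketch of the $PersonalReport.csv row layout implied by the parsing above (the sample row is invented): a word, a second field, then a comma-separated list of .txt files.

row = "hello,42,book1.txt, book2.txt"
word, _, rest = row.split(",", 2)
files = rest.strip().replace(", ", ",").replace(".txt", "").split(",")
print(word, files)  # -> hello ['book1', 'book2']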
Example #6
def _maybe_convert_sets(target_dir, extracted_data):
    extracted_dir = path.join(target_dir, extracted_data)
    # Skip the work if the normalized CSVs already exist
    target_csv_template = os.path.join(target_dir, ARCHIVE_DIR_NAME, ARCHIVE_NAME.replace('.tgz', '_{}.csv'))
    if os.path.isfile(target_csv_template.format('train')):
        return

    wav_root_dir = extracted_dir

    # Get the audio-file path and transcript for each sentence listed in metadata.csv
    samples = []
    glob_dir = os.path.join(wav_root_dir, '**/metadata.csv')
    for record in glob(glob_dir, recursive=True):
        # Skip records whose path matches any SKIP_LIST entry
        if any(sk in record for sk in SKIP_LIST):
            continue
        with open(record, 'r') as rec:
            for line in rec:
                cols = line.strip().split('|')
                audio = os.path.join(os.path.dirname(record), 'wavs', cols[0] + '.wav')
                transcript = cols[2]
                samples.append((audio, transcript))

    # Keep track of how many samples are good vs. problematic
    counter = {'all': 0, 'failed': 0, 'invalid_label': 0, 'too_short': 0, 'too_long': 0, 'total_time': 0}
    lock = RLock()
    num_samples = len(samples)
    rows = []

    def one_sample(sample):
        """ Take a audio file, and optionally convert it to 16kHz WAV """
        wav_filename = sample[0]
        file_size = -1
        frames = 0
        if path.exists(wav_filename):
            file_size = path.getsize(wav_filename)
            frames = int(subprocess.check_output(['soxi', '-s', wav_filename], stderr=subprocess.STDOUT))
        label = label_filter(sample[1])
        with lock:
            if file_size == -1:
                # Excluding samples that failed upon conversion
                counter['failed'] += 1
            elif label is None:
                # Excluding samples that failed on label validation
                counter['invalid_label'] += 1
            elif int(frames/SAMPLE_RATE*1000/15/2) < len(str(label)):
                # Excluding samples that are too short to fit the transcript
                counter['too_short'] += 1
            elif frames/SAMPLE_RATE > MAX_SECS:
                # Excluding very long samples to keep a reasonable batch-size
                counter['too_long'] += 1
            else:
                # This one is good - keep it for the target CSV
                rows.append((wav_filename, file_size, label))
            counter['all'] += 1
            counter['total_time'] += frames

    print("Importing WAV files...")
    pool = Pool(cpu_count())
    bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
    for i, _ in enumerate(pool.imap_unordered(one_sample, samples), start=1):
        bar.update(i)
    bar.update(num_samples)
    pool.close()
    pool.join()

    with open(target_csv_template.format('train'), 'w') as train_csv_file:  # 80%
        with open(target_csv_template.format('dev'), 'w') as dev_csv_file:  # 10%
            with open(target_csv_template.format('test'), 'w') as test_csv_file:  # 10%
                train_writer = csv.DictWriter(train_csv_file, fieldnames=FIELDNAMES)
                train_writer.writeheader()
                dev_writer = csv.DictWriter(dev_csv_file, fieldnames=FIELDNAMES)
                dev_writer.writeheader()
                test_writer = csv.DictWriter(test_csv_file, fieldnames=FIELDNAMES)
                test_writer.writeheader()

                for i, item in enumerate(rows):
                    transcript = validate_label(item[2])
                    if not transcript:
                        continue
                    wav_filename = item[0]
                    i_mod = i % 10
                    if i_mod == 0:
                        writer = test_writer
                    elif i_mod == 1:
                        writer = dev_writer
                    else:
                        writer = train_writer
                    writer.writerow(dict(
                        wav_filename=wav_filename,
                        wav_filesize=os.path.getsize(wav_filename),
                        transcript=transcript,
                    ))

    print('Imported %d samples.' % (counter['all'] - counter['failed'] - counter['too_short'] - counter['too_long']))
    if counter['failed'] > 0:
        print('Skipped %d samples that failed upon conversion.' % counter['failed'])
    if counter['invalid_label'] > 0:
        print('Skipped %d samples that failed on transcript validation.' % counter['invalid_label'])
    if counter['too_short'] > 0:
        print('Skipped %d samples that were too short to match the transcript.' % counter['too_short'])
    if counter['too_long'] > 0:
        print('Skipped %d samples that were longer than %d seconds.' % (counter['too_long'], MAX_SECS))
    print('Final amount of imported audio: %s.' % secs_to_hours(counter['total_time'] / SAMPLE_RATE))
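The writer selection above is a deterministic 80/10/10 split on the row index; a standalone sketch of the same modulo scheme:

from collections import Counter

def pick_split(i):
    # Row index mod 10: 0 -> test, 1 -> dev, 2..9 -> train
    i_mod = i % 10
    if i_mod == 0:
        return 'test'
    if i_mod == 1:
        return 'dev'
    return 'train'

print(Counter(pick_split(i) for i in range(1000)))
# -> Counter({'train': 800, 'test': 100, 'dev': 100})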
def get_section_subtitle(soup):
    section_heading = soup.find('p', {'class': 'sectionTitle title'})
    # Take the first bare text node (children without a tag name) that is non-blank
    section_subtitle = [child.strip() for child in section_heading
                        if not child.name and child.strip()][0]
    return section_subtitle
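A minimal usage sketch for get_section_subtitle, assuming BeautifulSoup (bs4) and an invented HTML fragment:

from bs4 import BeautifulSoup

html = '<p class="sectionTitle title"><span>1.2</span> Background </p>'
soup = BeautifulSoup(html, 'html.parser')
print(get_section_subtitle(soup))  # -> 'Background'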
    def parse(self, response):

        data = response.css('script[type="text/javascript"] ::text').extract()
        data2 = data[0].replace('\n', '').replace('  ', '').replace(
            "null", "None").replace("true", "True").replace("false", "False")
        data3 = data2.replace('var off_data_data =', '').replace(
            ';// Instantiate API implementationvar off_data_tool = new inz.tools.OFFTool(off_data_data, \'/@@off_ajax\', {"filtersTitle": "Use this tool to find relevant fees and receiving centre information for a visa or employer scheme", "filtersCallToActionLabel": "View fees & Receiving Centre"});',
            '')
        data1 = ast.literal_eval(data3)  # parse the JS object literal as a Python literal
        citizenship_countries = data1["citizenship_countries"]  # [{'label': 'Zimbabwe', 'value': 'ZWE'}, ...] (207 entries)
        visas_and_schemes = data1["visas_and_schemes"]  # each entry carries 'productSets': [{'uid': ..., 'title': ...}, ...]
        residence_countries = data1["residence_countries"]  # [{'label': 'Kiribati', 'regions': None, 'value': 'KIR'}, ...] (247 entries)
        countries = []

        for data in citizenship_countries:
            countries.append(data["value"])

        for data in residence_countries:
            country = data["label"]   # e.g. 'Zimbabwe'
            code = data["value"]      # e.g. 'KIR'
            region = data["regions"]  # list of region names, or None

            if code in countries:
                for datadata in visas_and_schemes:
                    uid = datadata["uid"]  # 6047a8ec183e45909dc8ade7bd56bdaf
                    title = datadata["title"]  #  visit or study
                    productSets = datadata["productSets"]  #  list

                    for product in productSets:
                        citizenshipRestrictions = product['citizenshipRestrictions']  # list or None
                        groupName = product["groupName"].strip()  # e.g. 'Paper submission'
                        selectionLabel = product["selectionLabel"]
                        selectionValues = product["selectionValues"]
                        # selectionValues may be None; normalize to an empty list
                        selectionval = selectionValues or []
                        value_list = [v["value"] for v in selectionval]

                        # Proceed when there are no citizenship restrictions,
                        # or this country is explicitly listed in them
                        allowed = (citizenshipRestrictions is None
                                   or code in citizenshipRestrictions)

                        if region is None:
                            if allowed:
                                if not value_list:
                                    url1 = "https://www.immigration.govt.nz/@@off_ajax?uid=" + uid + "&citizenship=" + code + "&residenceCountry=" + code + "&residenceRegion=" + "&groupName=" + groupName.replace(
                                        ' ', '%20') + "&selectionValue="
                                    yield scrapy.Request(
                                        url1,
                                        callback=self.parse_fee,
                                        meta={
                                            'country': country,
                                            'name': title,
                                            'region': ''
                                        })
                                else:
                                    for sel in value_list:
                                        url1 = ("https://www.immigration.govt.nz/@@off_ajax?uid=" + uid
                                                + "&citizenship=" + code
                                                + "&residenceCountry=" + code
                                                + "&residenceRegion="
                                                + "&groupName=" + groupName.replace(' ', '%20')
                                                + "&selectionValue=" + sel)
                                        yield scrapy.Request(
                                            url1,
                                            callback=self.parse_fee,
                                            meta={
                                                'country': country,
                                                'name': title,
                                                'region': ''
                                            })

                        else:
                            for reg in region:
                                if allowed:
                                    if not value_list:
                                        url1 = ("https://www.immigration.govt.nz/@@off_ajax?uid=" + uid
                                                + "&citizenship=" + code
                                                + "&residenceCountry=" + code
                                                + "&residenceRegion=" + reg.strip().replace(' ', '%20')
                                                + "&groupName=" + groupName.replace(' ', '%20')
                                                + "&selectionValue=")
                                        yield scrapy.Request(
                                            url1,
                                            callback=self.parse_fee,
                                            meta={
                                                'country': country,
                                                'name': title,
                                                'region': reg
                                            })
                                    else:
                                        for sel in value_list:
                                            url1 = ("https://www.immigration.govt.nz/@@off_ajax?uid=" + uid
                                                    + "&citizenship=" + code
                                                    + "&residenceCountry=" + code
                                                    + "&residenceRegion=" + reg.strip().replace(' ', '%20')
                                                    + "&groupName=" + groupName.replace(' ', '%20')
                                                    + "&selectionValue=" + sel)
                                            yield scrapy.Request(
                                                url1,
                                                callback=self.parse_fee,
                                                meta={
                                                    'country': country,
                                                    'name': title,
                                                    'region': reg
                                                })
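The hand-built query strings above could also be assembled with urllib.parse; a sketch via a hypothetical helper, assuming the parameter names are exactly those concatenated in the spider:

from urllib.parse import urlencode, quote

def build_off_ajax_url(uid, code, region='', group_name='', selection_value=''):
    # quote_via=quote reproduces the %20 space encoding used above
    params = {
        'uid': uid,
        'citizenship': code,
        'residenceCountry': code,
        'residenceRegion': region,
        'groupName': group_name,
        'selectionValue': selection_value,
    }
    return 'https://www.immigration.govt.nz/@@off_ajax?' + urlencode(params, quote_via=quote)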
    
sql1 = 'INSERT INTO `people` (`person_id`, `name`, `surname`, `sex`, `bdate`, `posts`) VALUES (%s, %s, %s, %s, %s, %s)'
sql2 = 'INSERT INTO `langs` (`lang_id`, `lang`) VALUES (%s, %s)'
sql3 = 'INSERT INTO `pl` (`connection_id`, `person_id`, `lang_id`) VALUES (%s, %s, %s)'

# cur / connection / lngBase come from the MySQL setup earlier in the script;
# lngBase is used here as a dict mapping already-inserted languages to their ids
with open('seshcha.csv', encoding='utf8') as f:
    lines = f.readlines()

lId = 0
cId = 0
for line in lines[1:]:  # skip the header row
    info = line.split(';')
    nm = info[0]
    srnm = info[1]
    pId = info[2]
    sx = info[3]
    bdt = info[4]
    lngs = info[5].split()
    psts = info[6]
    cur.execute(sql1, (pId, nm, srnm, sx, bdt, psts))
    for lng in lngs:
        lng = lng.strip(',')
        if lng not in lngBase:
            lngBase[lng] = lId
            cur.execute(sql2, (lId, lng))
            lId += 1
        cur.execute(sql3, (cId, pId, lngBase[lng]))
        cId += 1

connection.commit()
cur.close()
connection.close()
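A sketch of the seshcha.csv row layout the loop above assumes (the sample row is invented), with semicolon-separated columns in the order name;surname;person_id;sex;bdate;langs;posts:

line = "Anna;Ivanova;42;f;1990-05-01;russian, english;12\n"
info = line.split(';')
print(info[0], [l.strip(',') for l in info[5].split()])
# -> Anna ['russian', 'english']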