def get_corpus(self):
    """Collect the it_IT corpus: map each wav path to its transcript.

    Walks every ``metadata.csv`` under ``<origin_data_path>/it_IT``,
    reads pipe-separated rows (``filename|raw|normalized``), and builds a
    Corpus of {audio_path: transcript}.

    Returns:
        Corpus: utterances dict plus the list of audio paths;
        resampling disabled (clips are already WAV 16000Hz, 1 channel).

    Fixes vs. original: removed the dead local ``samples`` list (built but
    never read) and renamed the loop variable ``re`` which shadowed the
    ``re`` module.
    """
    SKIP_LIST = []  ## filter(None, CLI_ARGS.skiplist.split(","))
    ## extract training and development datasets
    ## do data merge, ArchiveImporter make final train/test/dev datasets
    utterances = {}
    audios = []
    wav_root_dir = os.path.join(self.origin_data_path, 'it_IT')
    # Get audiofile path and transcript for each sentence in the metadata csv
    glob_dir = os.path.join(wav_root_dir, "**/metadata.csv")
    for record in glob(glob_dir, recursive=True):
        if any(map(lambda sk: sk in record, SKIP_LIST)):
            continue
        enc = encoding_from_path(record)
        with open(record, "r", encoding=enc) as rec:
            for line in rec.readlines():
                fields = line.strip().split("|")
                audio = os.path.join(os.path.dirname(record), "wavs", fields[0] + ".wav")
                transcript = fields[2]
                ## append data manifest
                utterances[audio] = transcript
                audios.append(audio)
    ## collect corpus
    corpus = Corpus(utterances, audios)
    #################
    ## evalita2009 have clips WAV 16000Hz - 1 chnl
    ## not require resample
    corpus.make_wav_resample = False
    return corpus
def pankong(re):
    """Return the first element of *re* with surrounding whitespace removed.

    An empty sequence yields the empty string.
    """
    return re[0].strip() if len(re) else ''
def get_corpus(self):
    """Build the it_IT corpus with bad-example filtering and apostrophe fixing.

    Walks every ``metadata.csv`` under ``<origin_data_path>/it_IT``, skips
    filenames listed by ``get_bad_examples()`` (see MozillaItalia
    DeepSpeech-Italian-Model issue #124), and repairs the normalization of
    the character '’' via ``fix_apostrophe``.

    Returns:
        Corpus: utterances dict plus the list of audio paths, with
        ``make_wav_resample`` enabled.
    """
    SKIP_LIST = []  ## filter(None, CLI_ARGS.skiplist.split(","))
    ## extract training and development datasets
    ## do data merge, ArchiveImporter make final train/test/dev datasets
    utterances = {}
    audios = []
    fixed_token = {}
    wav_root_dir = os.path.join(self.origin_data_path, 'it_IT')
    bad_examples = self.get_bad_examples()
    # One metadata.csv per sub-folder; rows are filename|raw|normalized.
    pattern = os.path.join(wav_root_dir, "**/metadata.csv")
    for metadata_path in glob(pattern, recursive=True):
        if any(skip in metadata_path for skip in SKIP_LIST):
            continue
        meta_encoding = encoding_from_path(metadata_path)
        with open(metadata_path, "r", encoding=meta_encoding) as meta_file:
            for row in meta_file.readlines():
                fields = row.strip().split("|")
                clip_name = fields[0]
                ## filter bad examples (https://github.com/MozillaItalia/DeepSpeech-Italian-Model/issues/124#issuecomment-798613031)
                if clip_name in bad_examples:
                    continue
                audio = os.path.join(os.path.dirname(metadata_path), "wavs", clip_name + ".wav")
                ## in MLS normalization of character '’' is wrong in transcription normalization
                transcript = fix_apostrophe(fields[1], fields[2], fixed_token)
                ## append data manifest
                utterances[audio] = transcript
                audios.append(audio)
    ## collect corpus
    corpus = Corpus(utterances, audios)
    # NOTE(review): unlike the sibling importer this one enables resampling.
    corpus.make_wav_resample = True
    ##self.save_wrong_token_dictionary(fixed_token)
    return corpus
def find_end(f_name):
    """Collect ``<kw name="...">`` names from *f_name* up to 'End Web Test'.

    Scans the file line by line; every line containing ``<kw name=`` has its
    keyword name extracted (via char-set strip + split on ``" library=``) and
    appended, stopping (exclusive) at the first name containing
    'End Web Test'.

    Args:
        f_name: path of the log/XML file to scan.

    Returns:
        list[str]: keyword names seen before 'End Web Test'.

    Fixes vs. original: the file handle was never closed (now a ``with``
    block), the unused ``start`` flag is removed, and the loop no longer
    shadows the ``re`` module.
    """
    names = []
    with open(f_name, 'r') as f:
        for line in f:
            if "<kw name=" in line:
                # str.strip removes a *character set*, not a prefix; this
                # matches the original extraction exactly.
                kw = line.strip('<kw name="').split('" library=')[0]
                if 'End Web Test' in kw:
                    break
                names.append(kw.strip('>\n'))
    return names
def search_in_subs(bookdir, subsdir, val=None):
    """Interactively pick a word from $PersonalReport.csv and search subs.

    Reads the report CSV (rows of ``word,count,file-list``), lets the user
    choose a word (by name or by 1-based row number) and one of its files,
    then delegates to ``print_matching_sub_lines`` and optionally recurses
    for another search.

    Args:
        bookdir: passed through to the recursive call only — TODO confirm
            it is otherwise unused on purpose.
        subsdir: directory containing the subtitle files to search.
        val: preselected word or row number; when None the user is prompted.
    """
    with open("$PersonalReport.csv", "r", encoding="utf-8") as file:
        data = file.read().splitlines()
    # Map word -> comma-joined file list (third CSV field, ".txt" removed).
    refdict = {}
    for d in data:
        k, _, re = d.split(",", 2)
        refdict[k] = re.strip().replace(", ", ",").replace(".txt", "")
    if val is None:
        # No preselection: show the first 20 report rows and prompt.
        print("Top 20 Words:")
        for i, item in enumerate(data[:20], 1):
            print(i, ". " + item, sep="")
        print("specify a word: ")
        val = input()
    if val in refdict:
        # val is a word present in the report; list its files, pick one.
        for i, item in enumerate(refdict[val].split(","), 1):
            print(i, ". " + item, sep="")
        print("Choose the file")
        choic = input()
        sel = refdict[val].split(",")[int(choic) - 1]
    elif val.isnumeric():
        # val is a 1-based row number into the report; resolve to its word.
        val = data[int(val) - 1].split(",")[0]
        for i, item in enumerate(refdict[val].split(","), 1):
            print(i, ". " + item, sep="")
        print("Choose the file")
        choic = input()
        # sel = refdict[data[val-1].split(",")[0]].split(",")[int(choic) - 1]
        sel = refdict[val].split(",")[int(choic) - 1]
        print(sel)
    else:
        print("not found in the report")
        return
    print(f"{subsdir}/{sel}")
    # print_matching_sub_lines returns an open logger object — must be closed.
    sublogger = print_matching_sub_lines(subsdir, selection=sel, value=val)
    sublogger.close()
    more_choic = input("Do you want to search in another file? ")
    if more_choic == "":
        return
    if more_choic[0].lower() == "y":
        # Recurse with the same word so only the file choice is re-asked.
        search_in_subs(bookdir, subsdir, val=str(val))
def _maybe_convert_sets(target_dir, extracted_data):
    """Scan metadata.csv files under the extracted archive and write
    train/dev/test CSV manifests (80/10/10 split by row index).

    Args:
        target_dir: root output directory.
        extracted_data: sub-directory holding the extracted archive.

    Side effects: writes ``<ARCHIVE_NAME>_{train,dev,test}.csv`` under
    ``target_dir/ARCHIVE_DIR_NAME`` and prints import statistics.
    """
    extracted_dir = path.join(target_dir, extracted_data)
    # override existing CSV with normalized one
    target_csv_template = os.path.join(
        target_dir, ARCHIVE_DIR_NAME, ARCHIVE_NAME.replace('.tgz', '_{}.csv'))
    # NOTE(review): the template still contains '{}' here, so this isfile
    # check looks like it can never be True — confirm intent.
    if os.path.isfile(target_csv_template):
        return

    wav_root_dir = os.path.join(extracted_dir)

    # Get audiofile path and transcript for each sentence in the metadata csv
    samples = []
    glob_dir = os.path.join(wav_root_dir, '**/metadata.csv')
    for record in glob(glob_dir, recursive=True):
        # BUG FIX: the original looped `for sk in SKIP_LIST` and parsed the
        # record *inside* that loop, so each record was parsed once per
        # non-matching skip entry (duplicated samples) and never parsed at
        # all when SKIP_LIST was empty. Check the skip list once per record,
        # matching the sibling importers.
        if any(sk in record for sk in SKIP_LIST):
            continue
        with open(record, 'r') as rec:
            for row in rec.readlines():
                fields = row.strip().split('|')
                audio = os.path.join(os.path.dirname(record), 'wavs', fields[0] + '.wav')
                transcript = fields[2]
                samples.append((audio, transcript))

    # Keep track of how many samples are good vs. problematic
    counter = {'all': 0, 'failed': 0, 'invalid_label': 0,
               'too_short': 0, 'too_long': 0, 'total_time': 0}
    lock = RLock()
    num_samples = len(samples)
    rows = []

    def one_sample(sample):
        """Take a audio file, and optionally convert it to 16kHz WAV"""
        wav_filename = sample[0]
        file_size = -1
        frames = 0
        if path.exists(wav_filename):
            file_size = path.getsize(wav_filename)
            # soxi -s prints the sample (frame) count of the wav file.
            frames = int(subprocess.check_output(
                ['soxi', '-s', wav_filename], stderr=subprocess.STDOUT))
        label = label_filter(sample[1])
        with lock:
            if file_size == -1:
                # Excluding samples that failed upon conversion
                counter['failed'] += 1
            elif label is None:
                # Excluding samples that failed on label validation
                counter['invalid_label'] += 1
            elif int(frames / SAMPLE_RATE * 1000 / 15 / 2) < len(str(label)):
                # Excluding samples that are too short to fit the transcript
                counter['too_short'] += 1
            elif frames / SAMPLE_RATE > MAX_SECS:
                # Excluding very long samples to keep a reasonable batch-size
                counter['too_long'] += 1
            else:
                # This one is good - keep it for the target CSV
                rows.append((wav_filename, file_size, label))
            counter['all'] += 1
            counter['total_time'] += frames

    print("Importing WAV files...")
    # NOTE(review): one_sample is a closure; a process-based Pool must pickle
    # it, which fails for nested functions — presumably Pool here is the
    # thread pool (multiprocessing.dummy), as the shared RLock suggests.
    # Confirm against the file's imports.
    pool = Pool(cpu_count())
    bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
    for i, _ in enumerate(pool.imap_unordered(one_sample, samples), start=1):
        bar.update(i)
    bar.update(num_samples)
    pool.close()
    pool.join()

    with open(target_csv_template.format('train'), 'w') as train_csv_file:  # 80%
        with open(target_csv_template.format('dev'), 'w') as dev_csv_file:  # 10%
            with open(target_csv_template.format('test'), 'w') as test_csv_file:  # 10%
                train_writer = csv.DictWriter(train_csv_file, fieldnames=FIELDNAMES)
                train_writer.writeheader()
                dev_writer = csv.DictWriter(dev_csv_file, fieldnames=FIELDNAMES)
                dev_writer.writeheader()
                test_writer = csv.DictWriter(test_csv_file, fieldnames=FIELDNAMES)
                test_writer.writeheader()
                for i, item in enumerate(rows):
                    transcript = validate_label(item[2])
                    if not transcript:
                        continue
                    wav_filename = item[0]
                    # Row index modulo 10 drives the 80/10/10 split.
                    i_mod = i % 10
                    if i_mod == 0:
                        writer = test_writer
                    elif i_mod == 1:
                        writer = dev_writer
                    else:
                        writer = train_writer
                    writer.writerow(dict(
                        wav_filename=wav_filename,
                        wav_filesize=os.path.getsize(wav_filename),
                        transcript=transcript,
                    ))

    print('Imported %d samples.' % (counter['all'] - counter['failed'] - counter['too_short'] - counter['too_long']))
    if counter['failed'] > 0:
        print('Skipped %d samples that failed upon conversion.' % counter['failed'])
    if counter['invalid_label'] > 0:
        print('Skipped %d samples that failed on transcript validation.' % counter['invalid_label'])
    if counter['too_short'] > 0:
        print('Skipped %d samples that were too short to match the transcript.' % counter['too_short'])
    if counter['too_long'] > 0:
        print('Skipped %d samples that were longer than %d seconds.' % (counter['too_long'], MAX_SECS))
    print('Final amount of imported audio: %s.' % secs_to_hours(counter['total_time'] / SAMPLE_RATE))
def get_section_subtitle(soup):
    """Return the first non-empty text node inside the section-title <p>.

    Raises IndexError when the heading contains no non-empty text node,
    exactly like the original single-expression version.
    """
    heading = soup.find('p', {'class': 'sectionTitle title'})
    # Keep only bare text children (no tag name) whose stripped text is
    # non-empty, then take the first one.
    texts = [node.strip() for node in heading if not node.name and node.strip()]
    return texts[0]
def parse(self, response):
    """Parse the immigration.govt.nz fee-tool page and fan out fee requests.

    Extracts the ``off_data_data`` blob embedded in an inline <script>,
    converts it from a JS literal to a Python literal, and yields one
    ``@@off_ajax`` request per (residence country x visa scheme x product x
    selection value) combination to ``self.parse_fee``.

    NOTE(review): claims about the blob's schema below come from the inline
    comments the original author left — confirm against a live page.
    """
    # The fee data lives in an inline <script>; grab all script text nodes.
    data = response.css('script[type="text/javascript"] ::text').extract()
    # Normalize the JS literal to Python: drop newlines/spaces and map
    # null/true/false to None/True/False for ast.literal_eval.
    data2 = data[0].replace('\n', '').replace(' ', '').replace(
        "null", "None").replace("true", "True").replace("false", "False")
    # Strip the variable assignment and the trailing tool-instantiation call.
    data3 = data2.replace('var off_data_data =', '').replace(
        ';// Instantiate API implementationvar off_data_tool = new inz.tools.OFFTool(off_data_data, \'/@@off_ajax\', {"filtersTitle": "Use this tool to find relevant fees and receiving centre information for a visa or employer scheme", "filtersCallToActionLabel": "View fees & Receiving Centre"});',
        '')
    data1 = ast.literal_eval(data3)
    # data=json.loads(data4)
    citizenship_countries = data1[
        "citizenship_countries"]  # data format {'label': 'Zimbabwe', 'value': 'ZWE'} len=207 it is useless
    visas_and_schemes = data1[
        "visas_and_schemes"]  # 'productSets': [{ id title
    residence_countries = data1[
        "residence_countries"]  # {'label': 'Kiribati', 'regions': None, 'value': 'KIR'} 247
    # Collect the codes of all citizenship countries for membership tests.
    countries = []
    for data in citizenship_countries:
        countries.append(data["value"])
    for data in residence_countries:
        country = data["label"]  # label': 'Zimbabwe',
        code = data["value"]  # value': 'KIR
        region = data["regions"]  # 'regions': None,
        # Only residence countries that are also citizenship countries.
        if code in countries:
            for datadata in visas_and_schemes:
                uid = datadata["uid"]  # 6047a8ec183e45909dc8ade7bd56bdaf
                title = datadata["title"]  # visit or study
                productSets = datadata["productSets"]  # list
                for product in productSets:
                    citizenshipRestrictions = product[
                        'citizenshipRestrictions']  # list or None
                    groupName = product["groupName"].strip()  # Paper submission
                    # NOTE(review): selectionLabel is read but never used.
                    selectionLabel = product["selectionLabel"]
                    selectionValues = product["selectionValues"]
                    selectionval = selectionValues
                    if selectionValues == None:
                        selectionval = ''
                    #print product
                    #print '\n-------' , selectionval ,'-----------------\n\n\n'
                    # Gather the selectable values (empty string iterates to
                    # nothing, so value_list stays empty when None above).
                    value_list = []
                    for value1 in selectionval:
                        value_list.append(value1["value"])
                    # xx == 0 means this country code is explicitly listed in
                    # the product's citizenship restrictions.
                    xx = 1
                    try:
                        if code in citizenshipRestrictions:
                            xx = 0
                    except:
                        # citizenshipRestrictions may be None (`in` raises);
                        # NOTE(review): bare except also hides other errors.
                        a = ''
                    if region == None:
                        # Country without regions: one request per selection
                        # value (or a single request when there are none).
                        if xx == 0 or citizenshipRestrictions == None:
                            #print url1
                            if value_list == []:
                                url1 = "https://www.immigration.govt.nz/@@off_ajax?uid=" + uid + "&citizenship=" + code + "&residenceCountry=" + code + "&residenceRegion=" + "&groupName=" + groupName.replace(
                                    ' ', '%20') + "&selectionValue="
                                yield scrapy.Request(
                                    url1,
                                    callback=self.parse_fee,
                                    meta={
                                        'country': country,
                                        'name': title,
                                        'region': ''
                                    })
                            else:
                                for sel in value_list:
                                    url1 = "https://www.immigration.govt.nz/@@off_ajax?uid=" + uid + "&citizenship=" + code + "&residenceCountry=" + code + "&residenceRegion=" + "&groupName=" + groupName.replace(
                                        ' ', '%20') + "&selectionValue=" + sel
                                    yield scrapy.Request(
                                        url1,
                                        callback=self.parse_fee,
                                        meta={
                                            'country': country,
                                            'name': title,
                                            'region': ''
                                        })
                    else:
                        # Country with regions: repeat per region.
                        # NOTE(review): loop variable `re` shadows the re module.
                        for re in region:
                            if xx == 0 or citizenshipRestrictions == None:
                                #print url1
                                if value_list == []:
                                    url1 = "https://www.immigration.govt.nz/@@off_ajax?uid=" + uid + "&citizenship=" + code + "&residenceCountry=" + code + "&residenceRegion=" + re.strip(
                                    ).replace(
                                        ' ', '%20'
                                    ) + "&groupName=" + groupName.replace(
                                        ' ', '%20') + "&selectionValue="
                                    yield scrapy.Request(
                                        url1,
                                        callback=self.parse_fee,
                                        meta={
                                            'country': country,
                                            'name': title,
                                            'region': re
                                        })
                                else:
                                    # NOTE(review): unlike the branch above,
                                    # this URL omits the region value and the
                                    # meta region is '' — possibly a bug;
                                    # confirm before changing.
                                    for sel in value_list:
                                        url1 = "https://www.immigration.govt.nz/@@off_ajax?uid=" + uid + "&citizenship=" + code + "&residenceCountry=" + code + "&residenceRegion=" + "&groupName=" + groupName.replace(
                                            ' ', '%20'
                                        ) + "&selectionValue=" + sel
                                        yield scrapy.Request(
                                            url1,
                                            callback=self.parse_fee,
                                            meta={
                                                'country': country,
                                                'name': title,
                                                'region': ''
                                            })
# Load people and their languages from seshcha.csv into the database.
# NOTE(review): MySQL drivers (e.g. pymysql/mysqlclient) support only the
# %s placeholder; the original used %c, which fails at execute() time.
sql1 = 'INSERT INTO `people` (`person_id`, `name`, `surname`, `sex`, `bdate`, `posts`) VALUES (%s, %s, %s, %s, %s, %s)'
sql2 = 'INSERT INTO `langs` (`lang_id`, `lang`) VALUES (%s, %s)'
sql3 = 'INSERT INTO `pl` (`connection_id`, `person_id`, `lang_id`) VALUES (%s, %s, %s)'

# Read the CSV once and close the handle (the original leaked it).
with open('seshcha.csv', encoding='utf8') as csv_file:
    lines = csv_file.readlines()

for line in lines[1:]:  # skip the header row
    info = line.split(';')
    # BUG FIX: the original indexed `line[0]`, `line[1]`, ... which takes
    # single characters of the raw line instead of the split fields.
    nm = info[0]
    srnm = info[1]
    pId = info[2]
    sx = info[3]
    bdt = info[4]
    lngs = info[5]
    psts = info[6]
    # NOTE(review): these counters reset for every person, so generated
    # lang_id / connection_id values repeat across rows — preserved from the
    # original; confirm the intended ID scheme against the schema.
    lId = 0
    cId = 0
    lngs = lngs.split()
    cur.execute(sql1, (pId, nm, srnm, sx, bdt, psts))
    for lng in lngs:
        # BUG FIX: `re.strip(',', lng)` is not a function of the re module
        # (AttributeError); strip trailing/leading commas from the token.
        lng = lng.strip(',')
        if lng not in lngBase:
            # New language: register it before linking.
            cur.execute(sql2, (lId, lng))
            lId += 1
        cur.execute(sql3, (cId, pId, lId))
        cId += 1

connection.commit()
cur.close()
connection.close()