import os

import pandas as pd
import pysrt
from pydub import AudioSegment


def prepare_data(raw_audio_dir, audio_outdir, csv_output):
    audio_list, sub_list = [], []
    # Iterate over the files in the raw audio directory
    for fn in os.listdir(raw_audio_dir):
        if fn.endswith(".wav"):
            # Audio and subtitle paths; splitext is safer than
            # fn.split('.') for names containing extra dots
            audio_path = os.path.join(raw_audio_dir, fn)
            audio_name = os.path.splitext(fn)[0]
            audio_sub_path = os.path.join(raw_audio_dir, f'{audio_name}.srt')
            # Read the audio track and its subtitles
            audio = AudioSegment.from_file(audio_path)
            subs = pysrt.open(audio_sub_path, encoding='utf-8')
            for sub in subs:
                # Extract start/end times in milliseconds
                start_ms = time_to_ms(sub.start)
                end_ms = time_to_ms(sub.end)
                audio_extract_name = os.path.join(
                    audio_outdir, f'{audio_name}_{start_ms}_{end_ms}.wav')
                text = normalize_text(str(sub.text))
                # CSV columns
                audio_list.append(audio_extract_name)
                sub_list.append(text)
                # Export the clip covered by this subtitle
                extract = audio[start_ms:end_ms]
                extract.export(audio_extract_name, format="wav")
    # Write the manifest to a CSV file; building the frame from the lists
    # avoids assigning non-empty columns to an empty, zero-row frame
    data_df = pd.DataFrame({'audio': audio_list, 'sub': sub_list})
    data_df.to_csv(csv_output, index=False)
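prepare_data relies on a time_to_ms helper that is not shown above. A minimal sketch, assuming the timestamps are pysrt SubRipTime values (pysrt's own t.ordinal property returns the same number):

def time_to_ms(t):
    # Total milliseconds in a pysrt SubRipTime; equivalent to t.ordinal
    return ((t.hours * 60 + t.minutes) * 60 + t.seconds) * 1000 + t.milliseconds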
def load_dataset(path):
    df = pd.read_excel(path)
    X = [normalize_text(x) for x in df["text"]]
    # Every column other than "text" is a label indicator; collect the
    # names of the columns with a positive value for each row
    # (drop needs axis=1 as a keyword in current pandas)
    y = df.drop("text", axis=1)
    columns = y.columns
    mask = y > 0
    y = list(mask.apply(lambda row: list(columns[row.values]), axis=1))
    return X, y
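For illustration, assuming a spreadsheet with a "text" column plus one 0/1 indicator column per label, the function returns a list of normalized strings and a parallel list of label-name lists (the file name and labels here are hypothetical):

X, y = load_dataset("reviews.xlsx")
print(X[0])  # e.g. "great service and friendly staff"
print(y[0])  # e.g. ['positive'] -- names of the columns marked > 0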
import csv
import logging

import sqlaload as sl  # assumed: the sqlaload database helper library

log = logging.getLogger(__name__)


def update_entities(engine, file_name):
    log.info("Updating entities reference sheet: %s", file_name)
    data = {}
    if os.path.exists(file_name):
        # Load the existing reference sheet, keyed by fingerprint
        with open(file_name, 'r', encoding='utf-8', newline='') as fh:
            for d in csv.DictReader(fh):
                data[d['etlFingerPrint']] = d
    print(len(data))
    fh = open(file_name, 'w', encoding='utf-8', newline='')
    writer = None
    table = sl.get_table(engine, 'entity')
    for row in sl.all(engine, table):
        fp = row['etlFingerPrint']
        if fp is None:
            continue
        if not row.get('canonicalName'):
            row['canonicalName'] = row['etlFingerPrint']
        row['canonicalName'] = cleanCanonical(row['canonicalName'])
        # Prefer a canonical name from the reference sheet if one exists
        entity = data.get(fp)
        if entity and entity.get('canonicalName') and \
                fp != entity.get('canonicalName'):
            row['canonicalName'] = entity.get('canonicalName')
        out = row.copy()
        del out['id']
        sl.upsert(engine, table, out, ['etlFingerPrint'])
        cn = row['canonicalName']
        row['normalizedForm'] = normalize_text(cn)
        row['reverseForm'] = reverse_normalize(cn)
        if writer is None:
            writer = csv.DictWriter(fh, row.keys())
            writer.writeheader()
        writer.writerow(row)
    fh.close()
def cluster(engine):
    # Dump the normalized fingerprint of every entity row
    for row in sl.all(engine, sl.get_table(engine, 'entity')):
        print(normalize_text(row['etlFingerPrint']))
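All of these snippets call a normalize_text helper whose definition is not shown. A minimal sketch, assuming it lower-cases, strips accents, and collapses whitespace (real implementations may also remove punctuation or tokenize):

import re
import unicodedata


def normalize_text(text):
    # Assumed behaviour: decompose accents, drop the combining marks,
    # collapse runs of whitespace, and lower-case the result
    text = unicodedata.normalize('NFKD', text)
    text = ''.join(c for c in text if not unicodedata.combining(c))
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()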
def sentiment(text):
    # Normalize, vectorize, predict, then map the indicator vector
    # back to human-readable label names
    text = normalize_text(text)
    X = x_transformer.transform([text])
    y_pred = estimator.predict(X)
    labels = y_transformer.inverse_transform(y_pred)[0]
    return labels
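Assuming x_transformer is a fitted text vectorizer (e.g. scikit-learn's TfidfVectorizer), estimator a fitted multi-label classifier, and y_transformer a MultiLabelBinarizer, a call might look like this (the input string is made up):

labels = sentiment("The room was clean but the staff were rude")
print(labels)  # e.g. ('negative', 'positive') -- tuple of predicted labels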
import json
import re
import time

import geopy

with open('tweets.json') as f:
    data = json.load(f)
tweets = [{key: tweet[key] for key in ['text', 'created_at']} for tweet in data]
print("Starting with " + str(len(tweets)) + " tweets")

classifier = TweetClassifier()
tweets = [x for x in tweets if classifier.classify(x['text'])]
print("After removing tweets not starting with #ddd: " + str(len(tweets)))

extractor = AddressExtractor()
for tweet in tweets:
    tweet['text'] = normalize_text(tweet['text'])
    # Cheap classifier for pedestrian versus cyclist
    tweet['cyclist'] = bool(re.search(r'(?i)cycl', tweet['text']))
    tweet['address'] = extractor.extract(tweet['text'])

geolocator = geopy.geocoders.GoogleV3()
for tweet in tweets:
    # Throttle requests to respect the geocoder's rate limit
    time.sleep(0.3)
    if tweet['address'] is None:
        continue
    try:
        address, (latitude, longitude) = geolocator.geocode(
            tweet['address'] + " Washington DC")
    except Exception as e:
        # Assumed completion: skip addresses the geocoder cannot resolve
        print(e)
        continue
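TweetClassifier and AddressExtractor are defined elsewhere. Going only by the log message above, the classifier keeps tweets that start with the #ddd hashtag; a minimal sketch of that assumption:

class TweetClassifier:
    # Assumption, based on the log message: keep only tweets whose
    # text starts with the #ddd hashtag
    def classify(self, text):
        return text.lower().startswith('#ddd')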