Example #1
def prepare_data(raw_audio_dir, audio_outdir, csv_output):
    audio_list, sub_list = [], []
    file_names = os.listdir(raw_audio_dir)
    # Iterate over the files in the directory
    for fn in file_names:
        if fn.endswith(".wav"):
            # Audio and sub dir
            audio_path = os.path.join(raw_audio_dir, fn)
            audio_name = os.path.splitext(fn)[0]
            audio_sub_name = f'{audio_name}.srt'
            audio_sub_path = os.path.join(raw_audio_dir, audio_sub_name)
            # Read the audio and its matching subtitle file
            audio = AudioSegment.from_file(audio_path)
            subs = pysrt.open(audio_sub_path, encoding='utf-8')
            for sub in subs:
                # Extract time
                start_ms = time_to_ms(sub.start)
                end_ms = time_to_ms(sub.end)
                audio_extract_name = os.path.join(audio_outdir, f'{audio_name}_{start_ms}_{end_ms}.wav')
                text = normalize_text(str(sub.text))
                # CSV Columns
                audio_list.append(audio_extract_name)
                sub_list.append(text)
                # Extract to file
                extract = audio[start_ms:end_ms]
                extract.export(audio_extract_name, format="wav")
    # Write the audio/transcript pairs out to a CSV file
    data_df = pd.DataFrame({'audio': audio_list, 'sub': sub_list})
    data_df.to_csv(csv_output, index=False)
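
Example #1 calls a `time_to_ms` helper that is not shown. A minimal sketch, assuming `sub.start` and `sub.end` are `pysrt` `SubRipTime` objects (which expose `hours`, `minutes`, `seconds`, and `milliseconds`), could look like this:

def time_to_ms(t):
    # Convert a pysrt SubRipTime into an integer millisecond offset,
    # which is what pydub's AudioSegment slicing expects.
    return ((t.hours * 60 + t.minutes) * 60 + t.seconds) * 1000 + t.milliseconds
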
Example #2
def load_dataset(path):
    df = pd.read_excel(path)
    # Normalize the input texts
    X = [normalize_text(x) for x in df["text"]]
    # The remaining columns are label indicators; keep the names of the
    # positive labels for each row (multi-label targets)
    y = df.drop(columns="text")
    columns = y.columns
    temp = y.apply(lambda item: item > 0)
    y = list(temp.apply(lambda item: list(columns[item.values]), axis=1))
    return X, y
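
`load_dataset` returns X as normalized texts and y as one list of label names per row, i.e. a multi-label target. A hedged usage sketch of turning those lists into a binary indicator matrix with scikit-learn (the file name is hypothetical):

from sklearn.preprocessing import MultiLabelBinarizer

X, y = load_dataset("reviews.xlsx")   # hypothetical input file
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(y)              # 0/1 matrix, one column per label in mlb.classes_
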
Example #3
def update_entities(engine, file_name):
    log.info("Updating entities reference sheet: %s", file_name)
    data = {}
    if os.path.exists(file_name):
        # Load the existing reference sheet, keyed by fingerprint
        with open(file_name, 'r', encoding='utf-8', newline='') as fh:
            reader = csv.DictReader(fh)
            for d in reader:
                data[d['etlFingerPrint']] = dict(d)
        print(len(data))

    fh = open(file_name, 'w', encoding='utf-8', newline='')
    writer = None
    table = sl.get_table(engine, 'entity')
    for row in sl.all(engine, table):
        fp = row['etlFingerPrint']
        if fp is None:
            continue
        if not row.get('canonicalName'):
            row['canonicalName'] = row['etlFingerPrint']
        row['canonicalName'] = cleanCanonical(row['canonicalName'])
        entity = data.get(fp)
        if entity and entity.get('canonicalName') and \
            fp != entity.get('canonicalName'):
            # print(entity.get('canonicalName'))
            row['canonicalName'] = entity.get('canonicalName')
            out = row.copy()
            del out['id']
            sl.upsert(engine, table, out, ['etlFingerPrint'])
        cn = row['canonicalName']
        row['normalizedForm'] = normalize_text(cn)
        row['reverseForm'] = reverse_normalize(cn)
        if writer is None:
            writer = csv.DictWriter(fh, row.keys())
            writer.writeheader()
        writer.writerow({k: str(v) for k, v in row.items()})
    fh.close()
Example #4
def cluster(engine):
    # Print the normalized fingerprint of every entity row
    for row in sl.all(engine, sl.get_table(engine, 'entity')):
        print(normalize_text(row['etlFingerPrint']))
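
Example #4 only prints normalized fingerprints. A hedged extension of the same idea that groups entity rows sharing a normalized fingerprint, using only the `sl` calls already present in the original code:

from collections import defaultdict

def cluster_by_normalized(engine):
    # Group entity rows whose fingerprints normalize to the same string.
    groups = defaultdict(list)
    for row in sl.all(engine, sl.get_table(engine, 'entity')):
        fp = row['etlFingerPrint']
        if fp is None:
            continue
        groups[normalize_text(fp)].append(row)
    return groups
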
Example #5
def sentiment(text):
    text = normalize_text(text)
    X = x_transformer.transform([text])
    y_pred = estimator.predict(X)
    labels = y_transformer.inverse_transform(y_pred)[0]
    return labels
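
`sentiment` relies on three module-level objects that are not shown: `x_transformer`, `estimator`, and `y_transformer`. A minimal sketch of one plausible setup (every choice below, including the training data names, is an assumption rather than the original project's code):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

# Hypothetical training step that would make sentiment() work as written.
x_transformer = TfidfVectorizer()
y_transformer = MultiLabelBinarizer()
X_train = x_transformer.fit_transform([normalize_text(t) for t in train_texts])
Y_train = y_transformer.fit_transform(train_labels)   # lists of label names per text
estimator = OneVsRestClassifier(LogisticRegression(max_iter=1000)).fit(X_train, Y_train)

labels = sentiment("battery life is great but the screen scratches easily")
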
Example #6
File: main.py Project: cash/struckdc_viz
import json
import re
import time

import geopy

with open('tweets.json') as f:
    data = json.load(f)

tweets = [{key: tweet[key] for key in ['text', 'created_at']} for tweet in data]

print("Starting with " + str(len(tweets)) + " tweets")

classifier = TweetClassifier()
tweets = [x for x in tweets if classifier.classify(x['text'])]
print("After removing tweets not starting with #ddd: " + str(len(tweets)))

extractor = AddressExtractor()
for tweet in tweets:
    tweet['text'] = normalize_text(tweet['text'])
    # cheap classifier for pedestrian versus cyclist
    tweet['cyclist'] = bool(re.search(r'(?i)cycl', tweet['text']))
    tweet['address'] = extractor.extract(tweet['text'])

geolocator = geopy.geocoders.GoogleV3()
for tweet in tweets:
    # throttling
    time.sleep(0.3)

    if tweet['address'] is None:
        continue

    try:
        address, (latitude, longitude) = geolocator.geocode(tweet['address'] + " Washington DC")
    except Exception as e: