Example #1
def make_aux_mixed(data_root, output_dir, upsample_ratio=1.0):
    ntype_kv = dict(zip(*read_kv(os.path.join(output_dir, 'ntype_train.txt'))))
    clean_kv = dict(
        zip(*read_kv(os.path.join(data_root, 'clean_label_kv.txt'))))
    noisy_kv = dict(
        zip(*read_kv(os.path.join(data_root, 'noisy_label_kv.txt'))))
    clean_keys = read_list(os.path.join(data_root, 'clean_train_key_list.txt'))
    noisy_keys = read_list(os.path.join(data_root, 'noisy_train_key_list.txt'))
    # upsampling clean keys to ratio * #noisy_keys
    clean_keys = np.random.choice(clean_keys,
                                  int(len(noisy_keys) * upsample_ratio))
    # mix clean and noisy data
    keys = list(clean_keys) + list(noisy_keys)
    np.random.shuffle(keys)
    clean, noisy, ntype = [], [], []
    for k in keys:
        if k in clean_kv:
            clean.append(clean_kv[k])
            noisy.append('-1')
        else:
            clean.append('-1')
            noisy.append(noisy_kv[k])
        if k in ntype_kv:
            ntype.append(ntype_kv[k])
        else:
            ntype.append('-1')
    keys = [k + ' -1' for k in keys]
    write_list(keys, os.path.join(output_dir, 'mixed_train_images.txt'))
    write_list(clean, os.path.join(output_dir, 'mixed_train_label_clean.txt'))
    write_list(noisy, os.path.join(output_dir, 'mixed_train_label_noisy.txt'))
    write_list(ntype, os.path.join(output_dir, 'mixed_train_label_ntype.txt'))
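Note: read_kv, read_list, and write_list are not shown on this page; they come from each project's own utils module and their behavior varies between the quoted examples. A minimal sketch consistent with how this example calls them (an assumption, not the original implementation):

# Hypothetical helpers matching the calls above; the real utils module may differ.
def read_list(file_path):
    # One entry per line, with the trailing newline stripped.
    with open(file_path) as f:
        return [line.rstrip('\n') for line in f]

def write_list(items, file_path):
    # Write one entry per line.
    with open(file_path, 'w') as f:
        for item in items:
            f.write('{}\n'.format(item))

def read_kv(file_path):
    # Each line holds "key value"; returning (keys, values) lets callers
    # build a mapping with dict(zip(*read_kv(path))).
    keys, values = [], []
    for line in read_list(file_path):
        k, v = line.split(None, 1)
        keys.append(k)
        values.append(v)
    return keys, values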
Example #2
def make_aux_mixed(data_root, output_dir, upsample_ratio=1.0):
    ntype_kv = dict(zip(*read_kv(os.path.join(output_dir, "ntype_train.txt"))))
    clean_kv = dict(zip(*read_kv(os.path.join(data_root, "clean_label_kv.txt"))))
    noisy_kv = dict(zip(*read_kv(os.path.join(data_root, "noisy_label_kv.txt"))))
    clean_keys = read_list(os.path.join(data_root, "clean_train_key_list.txt"))
    noisy_keys = read_list(os.path.join(data_root, "noisy_train_key_list.txt"))
    # upsampling clean keys to ratio * #noisy_keys
    clean_keys = np.random.choice(clean_keys, int(len(noisy_keys) * upsample_ratio))
    # mix clean and noisy data
    keys = list(clean_keys) + list(noisy_keys)
    np.random.shuffle(keys)
    clean, noisy, ntype = [], [], []
    for k in keys:
        if k in clean_kv:
            clean.append(clean_kv[k])
            noisy.append("-1")
        else:
            clean.append("-1")
            noisy.append(noisy_kv[k])
        if k in ntype_kv:
            ntype.append(ntype_kv[k])
        else:
            ntype.append("-1")
    keys = [k + " -1" for k in keys]
    write_list(keys, os.path.join(output_dir, "mixed_train_images.txt"))
    write_list(clean, os.path.join(output_dir, "mixed_train_label_clean.txt"))
    write_list(noisy, os.path.join(output_dir, "mixed_train_label_noisy.txt"))
    write_list(ntype, os.path.join(output_dir, "mixed_train_label_ntype.txt"))
Example #3
def get_release_years(avatar_list_path: str, getchu_data_path: str) -> list:
    """
  statistics of dataset's release years.
  :param avatar_list_path:
  :param getchu_data_path:
  :return:
  """
    avatar_list = utils.read_list(avatar_list_path)
    getchu_data_list = utils.read_list(getchu_data_path)
    avatar_list = list(map(lambda each: int(each[0]), avatar_list))

    getchu_data_list = list(
        map(
            lambda each:
            (int(each[0]), int(re.findall(r'(\d+)-\d+-\d+', each[1])[-1])),
            getchu_data_list))
    years = [
        0 for i in range(
            0,
            np.max(np.array(list(map(lambda each: each[0], getchu_data_list))))
            + 1)
    ]
    statistics = [
        0 for i in range(
            0,
            np.max(np.array(list(map(lambda each: each[1], getchu_data_list))))
            + 1)
    ]
    for each in getchu_data_list:
        years[each[0]] = each[1]
    for each in avatar_list:
        statistics[years[each]] += 1
    print(statistics[1990:])
    return statistics
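For orientation: the snippet appears to assume that utils.read_list returns each line already split into fields, with avatar rows starting with a numeric getchu id and getchu rows carrying a YYYY-MM-DD release date in their second field. A hypothetical call under those assumptions:

# Hypothetical usage; the file names and layouts belong to the original dataset.
stats = get_release_years('avatar.list', 'getchu.list')
# stats[2005] would then count the avatars whose source game was released in 2005.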
Example #4
def make_aux_ntype(data_root, output_dir):
    clean_kv = dict(zip(*read_kv(os.path.join(data_root, "clean_label_kv.txt"))))
    noisy_kv = dict(zip(*read_kv(os.path.join(data_root, "noisy_label_kv.txt"))))
    train_keys = set(read_list(os.path.join(data_root, "clean_train_key_list.txt")))
    val_keys = set(read_list(os.path.join(data_root, "clean_val_key_list.txt")))
    test_keys = set(read_list(os.path.join(data_root, "clean_test_key_list.txt")))
    noisy_keys = set(noisy_kv.keys())
    # compute and save matrix C
    keys = (train_keys | val_keys) & noisy_keys
    clean_labels = np.asarray([int(clean_kv[k]) for k in keys])
    noisy_labels = np.asarray([int(noisy_kv[k]) for k in keys])
    C = compute_matrix_c(clean_labels, noisy_labels)
    save_to_blobproto(C, os.path.join(output_dir, "matrix_c.binaryproto"))
    # make noise type (ntype)
    def _make(keys, token):
        clean_labels = np.asarray([int(clean_kv[k]) for k in keys])
        noisy_labels = np.asarray([int(noisy_kv[k]) for k in keys])
        lines = []
        alpha = 1.0 / (C.shape[0] - 1)
        for key, y, y_tilde in zip(keys, clean_labels, noisy_labels):
            if y == y_tilde:
                lines.append(key + " 0")
            elif alpha >= C[y_tilde][y]:
                lines.append(key + " 1")
            else:
                lines.append(key + " 2")
        np.random.shuffle(lines)
        output_file = os.path.join(output_dir, "ntype_{}.txt".format(token))
        write_list(lines, output_file)

    _make(train_keys & noisy_keys, "train")
    _make(val_keys & noisy_keys, "val")
    _make(test_keys & noisy_keys, "test")
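The example relies on compute_matrix_c, which is not shown here. Given the C[y_tilde][y] indexing and the alpha = 1.0 / (C.shape[0] - 1) threshold, a plausible reading is that C estimates the probability of each noisy label given the clean label. A minimal sketch under that assumption (the project's actual implementation may differ):

import numpy as np

def compute_matrix_c(clean_labels, noisy_labels):
    # Hypothetical: C[y_tilde][y] ~ P(noisy == y_tilde | clean == y),
    # estimated by counting co-occurrences and normalizing each clean-label column.
    num_classes = int(max(clean_labels.max(), noisy_labels.max())) + 1
    counts = np.zeros((num_classes, num_classes), dtype=np.float64)
    for y, y_tilde in zip(clean_labels, noisy_labels):
        counts[y_tilde, y] += 1
    col_sums = counts.sum(axis=0, keepdims=True)
    col_sums[col_sums == 0] = 1.0  # avoid dividing by zero for unseen clean classes
    return counts / col_sums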
Example #5
def LINK_ENTITIES(parsed_list, predicted, STAGE=True):
    """
    Two entities are considered to have a link if they appear in a range of two consecutive sentences.
    :return: a list of tuples
    """
    print("Start LINKS")
    if STAGE:
        people_links = []
        per_link = []
        location_links = []
        events = []

        for idx in tqdm(range(len(parsed_list))):
            per, loc, per_idx, loc_idx = get_ents_from_predicted(
                predicted[idx], parsed_list[idx])

            # CR per sentence
            if len(per_idx) > 1:
                ev = coref_events(parsed_list[idx], per_idx, loc_idx)
                if any(ev):
                    [x.append(idx) for x in ev]
                    events.append([x for x in ev])

            # PER + LOC LINKS per two sentences
            if idx < len(parsed_list) - 1:
                per2, loc2, per_idx2, loc_idx2 = get_ents_from_predicted(
                    predicted[idx + 1], parsed_list[idx + 1])
                per += per2
                loc += loc2
                per_idx += per_idx2
                loc_idx += loc_idx2
            per = list(set(per))
            loc = list(set(loc))

            # PEOPLE
            for a, b in combinations(per, 2):
                link_two_person(people_links, a, b)
                per_link.append([a, b, idx])

            # LOCATIONS
            if any(loc) and any(per):
                for l in loc:
                    for p in per:
                        if p != l:
                            location_links.append([l, p, idx])

        # POST PROCESS
        people_links = sorted([x for x in people_links if x[2] > 4],
                              key=lambda x: x[2],
                              reverse=True)
        events = [x[0] for x in events]
        # write_list('people_links', people_links)
        # write_list('location_links', location_links)
        # write_list('events', events)

    else:
        people_links = read_list('people_links')
        location_links = read_list('location_links')
        events = []  # events are not persisted above, so default to empty
    return people_links, location_links, events
Example #6
def get_short_edge_size(avatar_list_path: str, getchu_data_path: str) -> list:
  """
  statistics of dataset's shortest edge's size.
  :param avatar_list_path:
  :param getchu_data_path:
  :return:
  """
  avatar_list = utils.read_list(avatar_list_path)
  getchu_data_list = utils.read_list(getchu_data_path)
  avatar_list = list(map(lambda each: get_short_size(each[2].strip('\n')), avatar_list))
  statistics = [0 for i in range(0, np.max(np.array(avatar_list)) + 1)]
  for each in avatar_list:
    statistics[each] += 1
  print(statistics[42:])
  return statistics
Example #7
def _make(token):
    keys = read_list(
        os.path.join(data_root, 'clean_{}_key_list.txt'.format(token)))
    lines = [k + ' ' + label_kv[k] for k in keys]
    np.random.shuffle(lines)
    output_file = os.path.join(output_dir, 'clean_{}.txt'.format(token))
    write_list(lines, output_file)
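This nested helper is presumably invoked once per split by its enclosing function, much as the ntype variant in Example #4 calls its own _make. Hypothetical calls:

_make('train')
_make('val')
_make('test')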
Example #8
File: ksc.py  Project: RedHatOfficial/ksc
def read_data(self, arch, releasedir, symvers):
    """
    Read both data files
    """
    self.matchdata, exists = read_list(arch, releasedir, self.verbose)
    self.total = read_total_list(symvers)
    return exists
Example #9
def normalize_list(people_links, STAGE=True):
    print("Start NORMALIZATION")
    if STAGE:

        normalized = people_links_norm(people_links)

        for a, b in combinations(
            [[i, x] for i, x in enumerate(normalized) if len(x) == 1], 2):
            if a[1][0] in b[1][0] or b[1][0] in a[1][0]:
                [normalized.remove(x) for x in normalized if x[0] == a[1][0]]
                [normalized.remove(x) for x in normalized if x[0] == b[1][0]]
                normalized.append([a[1][0], b[1][0]])
                normalized[-1] = list(set(normalized[-1]))

        # write_list('normalized', normalized)

    else:
        normalized = read_list('normalized')

    print("NER and NORMALIZATION finished:", len(normalized),
          "'person' entities found")

    new_list = []
    for idx, link in enumerate(people_links):
        for i, name in [[i, x] for i, x in enumerate(link) if i != 2]:
            a = [x[0] for x in normalized if name in x]
            if any(a):
                people_links[idx][i] = a[0]
        if link[0] != link[1]:
            new_list.append(link)
    people_links = new_list
    return people_links
Example #10
def make_aux_ntype(data_root, output_dir):
    clean_kv = dict(
        zip(*read_kv(os.path.join(data_root, 'clean_label_kv.txt'))))
    noisy_kv = dict(
        zip(*read_kv(os.path.join(data_root, 'noisy_label_kv.txt'))))
    train_keys = set(
        read_list(os.path.join(data_root, 'clean_train_key_list.txt')))
    val_keys = set(read_list(os.path.join(data_root,
                                          'clean_val_key_list.txt')))
    test_keys = set(
        read_list(os.path.join(data_root, 'clean_test_key_list.txt')))
    noisy_keys = set(noisy_kv.keys())
    # compute and save matrix C
    keys = (train_keys | val_keys) & noisy_keys
    clean_labels = np.asarray([int(clean_kv[k]) for k in keys])
    noisy_labels = np.asarray([int(noisy_kv[k]) for k in keys])
    C = compute_matrix_c(clean_labels, noisy_labels)
    save_to_blobproto(C, os.path.join(output_dir, 'matrix_c.binaryproto'))

    # make noise type (ntype)
    def _make(keys, token):
        clean_labels = np.asarray([int(clean_kv[k]) for k in keys])
        noisy_labels = np.asarray([int(noisy_kv[k]) for k in keys])
        lines = []
        alpha = 1.0 / (C.shape[0] - 1)
        for key, y, y_tilde in zip(keys, clean_labels, noisy_labels):
            if y == y_tilde:
                lines.append(key + ' 0')
            elif alpha >= C[y_tilde][y]:
                lines.append(key + ' 1')
            else:
                lines.append(key + ' 2')
        np.random.shuffle(lines)
        output_file = os.path.join(output_dir, 'ntype_{}.txt'.format(token))
        write_list(lines, output_file)

    _make(train_keys & noisy_keys, 'train')
    _make(val_keys & noisy_keys, 'val')
    _make(test_keys & noisy_keys, 'test')
Example #11
def make_data(files, noise_types, data_root):
    # noise types training and val
    merged = list(zip(files, noise_types))  # list() so it can be shuffled and sliced in Python 3
    np.random.shuffle(merged)
    training = ['{} {}'.format(f, t) for f, t in merged[:8000]]
    test = ['{} {}'.format(f, t) for f, t in merged[8000:]]
    write_list(training, osp.join(data_root, 'ntype_train.txt'))
    write_list(test, osp.join(data_root, 'ntype_test.txt'))
    # noise types of mixed training images
    dic = defaultdict(lambda: -1)
    dic.update(dict(zip(files, noise_types)))
    files = read_list(osp.join(data_root, 'mixed_train_images.txt'))
    files = [f.split()[0] for f in files]
    noise_types = [dic[f] for f in files]
    write_list(noise_types, osp.join(data_root, 'mixed_train_label_ntype.txt'))
Example #13
    def __init__(self,
                 patient_info_dir,
                 merge_info_csv='merge_info.csv',
                 exclude_list='exclude_patient_sids.txt',
                 outcome_csv='new_outcomes.csv'):
        # patient_info_dir: directory containing the merge_info_csv, exclude_list, and outcome_csv
        # merge_info_csv: name of the csv containing list of edfs and timestamps for
        #   all patients
        # exclude_list: name of the txt file containing sids of patients to be excluded
        # outcome_csv: name of the csv containing patient outcomes

        merge_info_csv = os.path.join(patient_info_dir, merge_info_csv)
        exclude_list = os.path.join(patient_info_dir, exclude_list)
        outcome_csv = os.path.join(patient_info_dir, outcome_csv)

        self.merge_info_df = pd.read_csv(merge_info_csv)
        self.exclude_patients = utils.read_list(exclude_list)
        self.outcomes_df = pd.read_csv(outcome_csv, index_col=0)
Example #14
def rename_files(
    folder_path: Path = Argument(default='.',
                                 exists=True,
                                 file_okay=True,
                                 dir_okay=True,
                                 readable=True,
                                 resolve_path=True),
    list_path: Path = Argument(default='list.csv',
                               exists=True,
                               file_okay=True,
                               dir_okay=True,
                               readable=True,
                               resolve_path=True),
):
    names = read_list(list_path)  # used as a mapping: old file name -> new name
    for cp, dirs, files in walk(folder_path):
        for file in files:
            if file in names:
                move(path.join(cp, file),
                     path.join(cp, names[file].replace('\n', '')))
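Unlike most examples on this page, read_list is used here as a mapping (file in names, names[file]), so it presumably parses list.csv into old-name/new-name pairs. A hypothetical sketch consistent with that usage (not the original helper):

import csv

def read_list(list_path):
    # Hypothetical: each row of list.csv holds "old_name,new_name";
    # a dict makes the membership test and lookup above work.
    with open(list_path, newline='') as f:
        return {row[0]: row[1] for row in csv.reader(f) if len(row) >= 2}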
Example #15
def get_tileset(tileset_name, index=-1, override_offset=-1):
    offsets = utils.read_table('scripts/res/meta_tileset_load_offsets.tbl')
    base_offset = 0

    if override_offset == -1:
        if index == -1:
            idx_tbl = utils.read_table('scripts/res/meta_tileset_index.tbl')
            hits = [idx for idx in idx_tbl if idx_tbl[idx] == tileset_name]
            if len(hits) != 1:
                raise ValueError(
                    f"Found zero or multiple entries for {tileset_name}; "
                    "provide an index if it appears more than once")
            index = hits[0]

        base_offset = (int(offsets[index], 16) // 0x10) & 0xFF
    else:
        base_offset = override_offset

    tbl = utils.read_list(f'scripts/res/tilesets/{tileset_name}.lst',
                          base_offset)
    tbl[0] = ' '
    return tbl
Example #16
def visualize_ner(ydx):
    from preprocess import get_texts
    from config import spacy, FoundationTrilogy, displacy
    from utils import read_list
    from spacy.tokens import Span

    validation_idx = read_list('validation_dataset')
    sentences2 = get_texts(FoundationTrilogy)
    sentences = [sentences2[i] for i in validation_idx]
    nlp2 = spacy.load("en_core_web_sm", disable=['ner'])

    doc = nlp2(sentences[ydx])

    spans = []
    for sp in sents_pred_labels[ydx]:
        spans.append(Span(doc, int(sp['start_idx']), int(sp['end_idx']+1), label=sp['type']))

    doc.ents = spans
    colors = {"PER": "linear-gradient(90deg, #aa9cfc, #fc9ce7)", "LOC": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
    options = {"ents": ["PER", "LOC"], "colors": colors}
    displacy.serve(doc, style="ent", options=options)
Example #17
def fast_grab():
    global proxy_count
    proxies = []
    sites = read_list('sites.txt')

    for site in sites:
        print(f'[i]Site:{site}')
        try:
            proxies_from_site = parse_proxies(requests.get(site).text)
            if len(proxies_from_site) != 0:
                proxies += proxies_from_site
                proxy_count += len(proxies_from_site)
                print(
                    f'[+]Proxy from site:{str(len(proxies_from_site)).zfill(5)}, Total:{str(proxy_count).zfill(5)}'
                )
            else:
                print(f'[-]No proxy from: {site}')
        except SilentException as e:
            print(f'[-]Dead Site: {site}', e)

    return proxies
Example #18
    def __init__(self, **kwargs):
        super(SiameseDataLoader, self).__init__()
        self.input_args = kwargs
        self.image_size = kwargs.get('image_size')
        self.image_pairs_list = read_list(kwargs.get('data_list_path'))
        self.label_dir = kwargs.get('label_dir')
        self.rois_dir = kwargs.get('rois_dir')
        self.rois_siamese_dir = kwargs.get('rois_siamese_dir')
        self.multi_thread = kwargs.get('multi_thread', True)
        self.n_thread = kwargs.get('n_thread', 7)
        self.stop_word = kwargs.get('stop_word', '==STOP--')
        self.batch_size = kwargs.pop('batch_size', 10)
        self.mode = kwargs.pop('mode', 'train')
        self.data_num = len(self.image_pairs_list)
        self.current = 0
        self.worker_proc = None
        self._get_next(True)

        if self.multi_thread:
            self.stop_flag = mp.Value('b', False)
            self.result_queue = mp.Queue(maxsize=self.batch_size * 30)
            self.data_queue = mp.Queue()
Example #19
def upload_photos_to_album(album_id, photos):
    print('Upload photos to album %s' % album_id)
    if os.path.isfile(photos):
        files = [os.path.basename(photos)]
        output = os.path.dirname(photos)
    else:
        files = os.listdir(photos)
        output = photos
    done_file = os.path.join(output, '%s_done.txt' % album_id)
    finished = read_list(done_file)
    error_count = 0
    for f in files:
        image = os.path.join(output, f)
        _, ext = os.path.splitext(f)
        if not ext or ext.lower() not in ['.jpg', '.png', '.gif']:
            # print('Invalid %s' % image)
            continue
        try:
            if f not in finished:
                print('Uploading %s' % image)
                api.photo_upload(album_id, image, f)
                finished.append(f)
                write_list(done_file, finished)
                time.sleep(random.randint(1, 3))
            else:
                print('Skip %s' % image)
        except KeyboardInterrupt:
            print("User interrupt, quit.")
            raise
        except Exception as e:
            print("Error:%s On uploading :%s" % (e, image))
            traceback.print_exc()
            error_count += 1
            if error_count > 5:
                break
            time.sleep(error_count * 10)
Example #20
__license__ = "GPL"

import tensorflow as tf
from utils import read_list
from sklearn.datasets.mldata import fetch_mldata

# Fetch the dataset
dataset = fetch_mldata("USPS")
print("Dataset USPS loaded...")
data = dataset.data
target = dataset.target - 1  # Labels between 0 and 9 to match digits
n_samples = data.shape[0]  # Number of samples in the dataset
n_clusters = 10  # Number of clusters to obtain

# Get the split between training/test set and validation set
test_indices = read_list("split/usps/test")
validation_indices = read_list("split/usps/validation")

# Auto-encoder architecture
input_size = data.shape[1]
hidden_1_size = 500
hidden_2_size = 500
hidden_3_size = 2000
embedding_size = n_clusters
dimensions = [
    hidden_1_size,
    hidden_2_size,
    hidden_3_size,
    embedding_size,  # Encoder layer dimensions
    hidden_3_size,
    hidden_2_size,
Example #21
BIN_DIR = os.path.join(DATA_DIR, "bin")

OUTPUT_DIR = os.path.join(".", "output")

os.makedirs(OUTPUT_DIR, exist_ok=True)


np.random.seed(2020)
epoch = 1_000
batch_size = 8
train_split = 0.8
num_train = 13_580
num_test = 4_000
num_valid = 1_000

mos_list = utils.read_list(os.path.join(DATA_DIR, "mos_list.txt"))
train_idx = np.random.randint(0, len(mos_list), int(train_split * len(mos_list)))
mos_list = np.array(mos_list)

train_list = mos_list[train_idx]
valid_list = np.delete(mos_list, train_idx)

train_data = utils.data_generator(
    train_list, BIN_DIR, frame=True, batch_size=batch_size
)
valid_data = utils.data_generator(
    valid_list, BIN_DIR, frame=True, batch_size=batch_size
)

MOSNet = model.CNN()
model = MOSNet.build()
Example #22
def test(FLAG):
    print("Reading dataset...")
    # load data
    file_list = [
        FLAG.test_dir + file.replace('_sat.jpg', '')
        for file in os.listdir(FLAG.test_dir) if file.endswith('_sat.jpg')
    ]
    file_list.sort()
    Xtest = read_list(file_list, with_mask=False)

    vgg16 = VGG16(classes=7, shape=(256, 256, 3))
    vgg16.build(vgg16_npy_path=FLAG.init_from, mode=FLAG.mode)

    def initialize_uninitialized(sess):
        global_vars = tf.global_variables()
        is_not_initialized = sess.run(
            [tf.is_variable_initialized(var) for var in global_vars])
        not_initialized_vars = [
            v for (v, f) in zip(global_vars, is_not_initialized) if not f
        ]
        if len(not_initialized_vars):
            sess.run(tf.variables_initializer(not_initialized_vars))

    with tf.Session() as sess:
        if FLAG.save_dir is not None:
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            ckpt = tf.train.get_checkpoint_state(FLAG.save_dir)

            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)
                print("Model restored %s" % ckpt.model_checkpoint_path)
                sess.run(tf.global_variables())
            else:
                print("No model checkpoint in %s" % FLAG.save_dir)
        else:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.global_variables())
        print("Initialized")

        print("Plot saved in %s" % FLAG.plot_dir)
        for i, fname in enumerate(file_list):
            Xplot = sess.run(
                vgg16.pred,
                feed_dict={
                    vgg16.x: Xtest[i:(i + 1), :],
                    #vgg16.y: Ytest[i:(i+1),:],
                    vgg16.is_train: False
                })
            saveimg = skimage.transform.resize(Xplot[0],
                                               output_shape=(512, 512),
                                               order=0,
                                               preserve_range=True,
                                               clip=False)
            saveimg = label2rgb(saveimg)
            imageio.imsave(
                os.path.join(FLAG.plot_dir,
                             os.path.basename(fname) + "_mask.png"), saveimg)
            print(
                os.path.join(FLAG.plot_dir,
                             os.path.basename(fname) + "_mask.png"))
Example #23
File: test.py  Project: wgwangang/MOSNet
# set dir
DATA_DIR = './data'
BIN_DIR = os.path.join(DATA_DIR, 'bin')
PRE_TRAINED_DIR = './pre_trained'
OUTPUT_DIR = './output'


NUM_TRAIN = 13580
NUM_TEST=4000
NUM_VALID=3000


if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)
            
mos_list = utils.read_list(os.path.join(DATA_DIR,'mos_list.txt'))
random.shuffle(mos_list)

train_list= mos_list[0:-(NUM_TEST+NUM_VALID)]
random.shuffle(train_list)
valid_list= mos_list[-(NUM_TEST+NUM_VALID):-NUM_TEST]
test_list= mos_list[-NUM_TEST:]

print('{} for training; {} for valid; {} for testing'.format(NUM_TRAIN, NUM_VALID, NUM_TEST))    

# init model
MOSNet = model.CNN_BLSTM()
model = MOSNet.build()

# load pre-trained weights
model.load_weights(os.path.join(PRE_TRAINED_DIR, 'cnn_blstm.h5'))   # Load the best model   
Example #24
from preprocess import get_texts
from config import spacy, FoundationTrilogy, displacy
from utils import read_list
from spacy.tokens import Span
from validation import list_of_values

text = "When Hardin denied owning the Journal"

sentences = [(i, x) for i, x in enumerate(get_texts(FoundationTrilogy))
             if text in x][0]
index = sentences[0]
sentence = sentences[1]
predicted = [
    y for idx, y in enumerate(read_list('predicted')) if idx == index
][0]
print(sentence)
nlp = spacy.load("en_core_web_sm", disable=['ner'])
doc = nlp(sentence)
tags = list_of_values(predicted, doc)

dict_list = []
KEYS = ['start_idx', 'end_idx', 'text', 'type']
[dict_list.append(dict(zip(KEYS, elem))) for elem in tags]
"""spans = []
for sp in tags:
    spans.append(Span(doc, int(sp[0]), int(sp[1]), label=sp[2]))"""

spans = []
for sp in dict_list:
    spans.append(
        Span(doc, int(sp['start_idx']), int(sp['end_idx']), label=sp['type']))
Example #25
def _make(token):
    keys = read_list(os.path.join(data_root, "clean_{}_key_list.txt".format(token)))
    lines = [k + " " + label_kv[k] for k in keys]
    np.random.shuffle(lines)
    output_file = os.path.join(output_dir, "clean_{}.txt".format(token))
    write_list(lines, output_file)
Example #26
File: train.py  Project: rhoposit/MOSNet
# set dir
DATA_DIR = './data_' + args.data
BIN_DIR = os.path.join(DATA_DIR, args.feats)
OUTPUT_DIR = './results_O_alpha/output_' + args.model + "_" + str(
    args.batch_size) + "_" + args.data + "_" + args.feats + "_" + str(alpha)
results_file = OUTPUT_DIR + "/results.pkl"

EPOCHS = args.epoch
BATCH_SIZE = args.batch_size

if args.data == "VC":
    NUM_TRAIN = 13580
    NUM_TEST = 4000
    NUM_VALID = 3000
    mos_list = utils.read_list(os.path.join(DATA_DIR, 'mos_list.txt'))
    random.shuffle(mos_list)
    train_list = mos_list[0:-(NUM_TEST + NUM_VALID)]
    random.shuffle(train_list)
    valid_list = mos_list[-(NUM_TEST + NUM_VALID):-NUM_TEST]
    test_list = mos_list[-NUM_TEST:]
if args.data == "LA":
    train_list = utils.read_list(os.path.join(DATA_DIR, 'train_list.txt'))
    valid_list = utils.read_list(os.path.join(DATA_DIR, 'valid_list.txt'))
    test_list = utils.read_list(os.path.join(DATA_DIR, 'test_list.txt'))
    random.shuffle(train_list)
    random.shuffle(valid_list)
    random.shuffle(test_list)
    NUM_TRAIN = len(train_list)
    NUM_TEST = len(valid_list)
    NUM_VALID = len(test_list)
Example #27
def parse(file_path):
    lines = read_list(file_path)
    lines = map(str.split, lines)
    files, labels = zip(*lines)
    labels = map(int, labels)
    return files, labels
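parse apparently expects one "path label" pair per line. In Python 3 the map objects it returns are lazy, so a caller needing lengths or repeated iteration would materialize them. Hypothetical usage (the file name and layout are assumptions):

files, labels = parse('train_list.txt')
files, labels = list(files), list(labels)  # materialize the lazy map of labels
print(len(files), 'samples; first label:', labels[0])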
Example #28
def NER(sentence_list, STAGE=True, VALIDATE=False):
    """
    :param VALIDATE: if True, the txt files are not overwritten
    :return: labeled sentence list
    """
    print("Start NER")
    if STAGE:
        main_characters_ = []
        locations_ = []
        unclassified = []
        unclassified_sent = []
        predicted = []
        for sent in sentence_list:
            predicted.append(['O' for x in range(len(sent))])

        for i in tqdm(range(len(sentence_list))):
            doc = sentence_list[i]
            ents = get_ents_from_doc(doc)
            ents = [
                x for x in ents
                if len(x[0]) > 2 and not x[0].islower() and not x[0].isupper()
            ]
            if any(ents):
                for name, num_list in ents:
                    if len(num_list) > 1:
                        num = num_list[1]
                        token = doc[num]
                    else:
                        num = num_list[0]
                        token = doc[num]

                    if ner_person(doc, token, num):
                        main_characters_.append(name)
                        if len(num_list) == 1:
                            predicted[i][num_list[0]] = 'B-PER'
                        else:
                            predicted[i][num_list[0]] = 'B-PER'
                            predicted[i][num_list[1]] = 'I-PER'

                    elif ner_location(doc, token):
                        locations_.append(name)
                        if len(num_list) == 1:
                            predicted[i][num_list[0]] = 'B-LOC'
                        else:
                            predicted[i][num_list[0]] = 'B-LOC'
                            predicted[i][num_list[1]] = 'I-LOC'
                    else:
                        unclassified.append([name, i])
                        unclassified_sent.append([name, i, num_list])

        location_list = list(set(locations_))
        people_list = list(set([x for x in main_characters_ if len(x) > 2]))
        for ent in people_list:
            if any([x for x in punctuation_tokens if x in ent]):
                people_list.remove(ent)

        # remove bad retrieved entities
        for ent, i in unclassified:
            if any([x for x in punctuation_tokens if x in ent]):
                unclassified.remove([ent, i])

        # NER unclassified
        for tup in unclassified:
            if tup[0] in people_list:
                ner_unclassified_per(predicted, unclassified_sent, tup)
            if tup[0] in location_list:
                ner_unclassified_loc(predicted, unclassified_sent, tup)

        # write_list('predicted', predicted)

    else:
        predicted = read_list('predicted')

    return predicted
Example #29
def run(l2_val, dr, n, batch_size, bn):
    # set dir
    DATA_DIR = './data_'+args.data
    BIN_DIR = os.path.join(DATA_DIR, args.feats)
    OUTPUT_DIR = './results_R3/output_'+args.model+"_"+str(batch_size)+"_"+args.data+"_"+args.feats+"_"+str(l2_val)+"_"+str(dr)+"_"+str(n)+"_"+str(bn)
    results_file = OUTPUT_DIR+"/results.pkl"

    EPOCHS = args.epoch
    BATCH_SIZE = batch_size

    if args.data == "VC":
        NUM_TRAIN = 13580
        NUM_TEST=4000
        NUM_VALID=3000
        mos_list = utils.read_list(os.path.join(DATA_DIR,'mos_list.txt'))
        random.shuffle(mos_list)
        train_list= mos_list[0:-(NUM_TEST+NUM_VALID)]
        random.shuffle(train_list)
        valid_list= mos_list[-(NUM_TEST+NUM_VALID):-NUM_TEST]
        test_list= mos_list[-NUM_TEST:]
        train_data_feat, train_data_mos = utils.data_rep(train_list, BIN_DIR)
        valid_data_feat, valid_data_mos = utils.data_rep(valid_list, BIN_DIR)
    if args.data == "LA":
        test_list = utils.read_list(os.path.join(DATA_DIR,'test_list.txt'))
        train_data_feat = np.load(DATA_DIR+'/'+args.feats+'_X_train.npy')
        train_data_mos = np.load(DATA_DIR+'/'+args.feats+'_y_train.npy')
        valid_data_feat = np.load(DATA_DIR+'/'+args.feats+'_X_valid.npy')
        valid_data_mos = np.load(DATA_DIR+'/'+args.feats+'_y_valid.npy')
        NUM_TRAIN = train_data_feat.shape[0]
        NUM_TEST=valid_data_feat.shape[0]
        NUM_VALID=len(test_list)
    
    

    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
            

    print('{} for training; {} for valid; {} for testing'.format(NUM_TRAIN, NUM_TEST, NUM_VALID))

    # CNN-LDA has 100, and CNN-PCA has 512 ?? 
    rep_dims = {'DS-image':4096, 'CNN':100, 'xvec_0':512, 'xvec_1':512, 'xvec_2':512, 'xvec_3':512, 'xvec_4':512, 'xvec_5':512}
    

    # init model
    if args.model == 'CNN':
        dim = rep_dims[args.feats]
        MOSNet = model_rep.CNN(dim, l2_val, dr, n, bn)
#    elif args.model == 'FFN':
#        dim = rep_dims[args.feats]
#        MOSNet = model_rep.FFN(dim, dr, n, bn)
    else:
        raise ValueError('please specify model to train with, CNN, FFN')
        sys.exit()


    model = MOSNet.build()
    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-4),metrics=["mean_absolute_error"],
        loss="mse")

    
    
    CALLBACKS = [
        keras.callbacks.ModelCheckpoint(
            filepath=os.path.join(OUTPUT_DIR,'mosnet.h5'),
            save_best_only=True,
            monitor='val_loss',
            verbose=1),
        keras.callbacks.EarlyStopping(
            monitor='val_loss',
            mode='min',
            min_delta=0,
            patience=5,
            verbose=1)
    ]


    train_data_feat = np.expand_dims(train_data_feat, axis=3)
    valid_data_feat = np.expand_dims(valid_data_feat, axis=3)
    print(train_data_feat.shape)
    print(train_data_mos.shape)
    # start fitting model
    hist = model.fit(x=train_data_feat, y=train_data_mos,
                     epochs=EPOCHS,
                     callbacks=CALLBACKS,
                     shuffle=True,
                     batch_size=BATCH_SIZE,
                     validation_data=(valid_data_feat, valid_data_mos),
                     verbose=1)

    # plot testing result
    
    model.load_weights(os.path.join(OUTPUT_DIR,'mosnet.h5'),)   # Load the best model   

    print('testing...')
    MOS_Predict=np.zeros([len(test_list),])
    MOS_true   =np.zeros([len(test_list),])
    df = pd.DataFrame(columns=['audio', 'true_mos','predict_mos','system_ID','speaker_ID'])

    for i in tqdm(range(len(test_list))):

        if args.data == "VC":
            filepath=test_list[i].split(',')
            filename=filepath[0].split('.')[0]
            sysid = ""
            speakerid = ""
            mos=float(filepath[1])
        elif args.data == "LA":
            filepath=test_list[i].split(',')
            filename=filepath[2].split('.')[0]
            sysid = filepath[1]
            speakerid = filepath[0]
            mos=float(filepath[3])

        _DS = utils.read_rep(os.path.join(BIN_DIR,filename+'.npy'))
        
        _DS = np.expand_dims(_DS, axis=3)
        Average_score=model.predict(_DS, verbose=0, batch_size=1)

        MOS_Predict[i]=Average_score
        MOS_true[i] =mos
        

        
        df = df.append({'audio': filepath[0], 
                        'true_mos': MOS_true[i], 
                        'predict_mos': MOS_Predict[i], 
                        'system_ID': sysid, 
                        'speaker_ID': speakerid}, 
                       ignore_index=True)
    
    df.to_pickle(results_file)

    plt.style.use('seaborn-deep')
    x = df['true_mos']
    y = df['predict_mos']
    bins = np.linspace(1, 5, 40)
    plt.figure(2)
    plt.hist([x, y], bins, label=['true_mos', 'predict_mos'])
    plt.legend(loc='upper right')
    plt.xlabel('MOS')
    plt.ylabel('number') 
    plt.savefig('./'+OUTPUT_DIR+'/MOSNet_distribution.png', dpi=150)


    LCC=np.corrcoef(MOS_true, MOS_Predict)
    print('[UTTERANCE] Linear correlation coefficient= %f' % LCC[0][1])
    SRCC=scipy.stats.spearmanr(MOS_true.T, MOS_Predict.T)
    print('[UTTERANCE] Spearman rank correlation coefficient= %f' % SRCC[0])    
    MSE=np.mean((MOS_true-MOS_Predict)**2)
    print('[UTTERANCE] Test error= %f' % MSE)
        


    # Plotting scatter plot
    M=np.max([np.max(MOS_Predict),5])
    plt.figure(3)
    plt.scatter(MOS_true, MOS_Predict, s =15, color='b',  marker='o', edgecolors='b', alpha=.20)
    plt.xlim([0.5,M])
    plt.ylim([0.5,M])
    plt.xlabel('True MOS')
    plt.ylabel('Predicted MOS')
    plt.title('Utterance-Level')
    plt.savefig('./'+OUTPUT_DIR+'/MOSNet_scatter_plot.png', dpi=150)



    if args.data == "VC":
        # load vcc2018_system
        sys_df = pd.read_csv(os.path.join(DATA_DIR,'vcc2018_system.csv'))
        df['system_ID'] = df['audio'].str.split('_').str[-1].str.split('.').str[0] + '_' + df['audio'].str.split('_').str[0]
    elif args.data == "LA":
        # load LA 2019 system
        sys_df = pd.read_csv(os.path.join(DATA_DIR,'LA_mos_system.csv'))
     
    sys_result_mean = df[['system_ID', 'predict_mos']].groupby(['system_ID']).mean()
    sys_mer_df = pd.merge(sys_result_mean, sys_df, on='system_ID')                                                                                                                 

    sys_true = sys_mer_df['mean']
    sys_predicted = sys_mer_df['predict_mos']
    print(sys_true)
    print(sys_predicted)
    print(sys_true.shape)
    print(sys_predicted.shape)
    LCC=np.corrcoef(sys_true, sys_predicted)
    print('[SYSTEM] Linear correlation coefficient= %f' % LCC[0][1])
    SRCC=scipy.stats.spearmanr(sys_true.T, sys_predicted.T)
    print('[SYSTEM] Spearman rank correlation coefficient= %f' % SRCC[0])
    MSE=np.mean((sys_true-sys_predicted)**2)
    print('[SYSTEM] Test error= %f' % MSE)
        

    # Plotting scatter plot
    M=np.max([np.max(sys_predicted),5])
    # m=np.max([np.min(sys_predicted)-1,0.5])
    plt.figure(4)
    plt.scatter(sys_true, sys_predicted, s =25, color='b',  marker='o', edgecolors='b')
    plt.xlim([1,M])
    plt.ylim([1,M])
    plt.xlabel('True MOS')
    plt.ylabel('Predicted MOS')
    plt.title('System-Level')

    # # add system id
    # for i in range(len(sys_mer_df)):
    #     sys_ID = mer_df['system_ID'][i]
    #     x = mer_df['mean'][i]
    #     y = mer_df['predict_mos'][i]
    #     plt.text(x-0.05, y+0.1, sys_ID, fontsize=8)
    plt.savefig('./'+OUTPUT_DIR+'/MOSNet_system_scatter_plot.png', dpi=150)


    
    if args.data == "LA":
        spk_df = pd.read_csv(os.path.join(DATA_DIR,'LA_mos_speaker.csv'))
        spk_result_mean = df[['speaker_ID', 'predict_mos']].groupby(['speaker_ID']).mean()
        spk_mer_df = pd.merge(spk_result_mean, spk_df, on='speaker_ID')                          
        spk_result_mean = df[['speaker_ID', 'predict_mos']].groupby(['speaker_ID']).mean()
        spk_mer_df = pd.merge(spk_result_mean, spk_df, on='speaker_ID')                                                                                                                 
        spk_true = spk_mer_df['mean']
        spk_predicted = spk_mer_df['predict_mos']
        LCC=np.corrcoef(spk_true, spk_predicted)
        print('[SPEAKER] Linear correlation coefficient= %f' % LCC[0][1])
        SRCC=scipy.stats.spearmanr(spk_true.T, spk_predicted.T)
        print('[SPEAKER] Spearman rank correlation coefficient= %f' % SRCC[0])
        MSE=np.mean((spk_true-spk_predicted)**2)
        print('[SPEAKER] Test error= %f' % MSE)
            
        # Plotting scatter plot (speaker-level scores only exist for the LA data)
        M=np.max([np.max(spk_predicted),5])
        # m=np.max([np.min(spk_predicted)-1,0.5])
        plt.figure(4)
        plt.scatter(spk_true, spk_predicted, s =25, color='b',  marker='o', edgecolors='b')
        plt.xlim([1,M])
        plt.ylim([1,M])
        plt.xlabel('True MOS')
        plt.ylabel('Predicted MOS')
        plt.title('Speaker-Level')

        # # add speaker id
        # for i in range(len(spk_mer_df)):
        #     spk_ID = mer_df['speaker_ID'][i]
        #     x = mer_df['mean'][i]
        #     y = mer_df['predict_mos'][i]
        #     plt.text(x-0.05, y+0.1, spk_ID, fontsize=8)
        plt.savefig('./'+OUTPUT_DIR+'/MOSNet_speaker_scatter_plot.png', dpi=150)
Example #30
        ],
        "dev": ["dev-clean", "dev-other"],
    }

    num_wordpieces = 5000
    nbest = 10
    prefix = "librispeech-train-all-unigram-{}".format(num_wordpieces)
    prefix = os.path.join(args.dst, prefix)
    textfile = os.path.join(args.dst, "train-all.text")
    model = prefix + ".model"
    vocab = prefix + ".vocab"

    # prepare data
    sys.stdout.write("preparing data...\n")
    sys.stdout.flush()
    train_text = utils.read_list(args.src, filelists["train"])
    dev_text = utils.read_list(args.src, filelists["dev"])

    with open(textfile, "w") as f:
        for line in train_text:
            f.write(line)
            f.write("\n")

    word_dict = set()
    for line in train_text + dev_text:
        words = line.split()
        for w in words:
            word_dict.add(w)
    word_dict = sorted(word_dict)

    # train
Example #31
def train(FLAG):
    print("Reading dataset...")
    # load data
    Xtrain, Ytrain = read_images(TRAIN_DIR), read_masks(TRAIN_DIR, onehot=True)
    Xtest, Ytest = read_images(VAL_DIR), read_masks(VAL_DIR, onehot=True)
    track = [
        "hw3-train-validation/validation/0008",
        "hw3-train-validation/validation/0097",
        "hw3-train-validation/validation/0107"
    ]
    Xtrack, Ytrack = read_list(track)

    vgg16 = VGG16(classes=7, shape=(256, 256, 3))
    vgg16.build(vgg16_npy_path=FLAG.init_from,
                mode=FLAG.mode,
                keep_prob=FLAG.keep_prob)

    saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)
    checkpoint_path = os.path.join(FLAG.save_dir, 'model.ckpt')

    def initialize_uninitialized(sess):
        global_vars = tf.global_variables()
        is_not_initialized = sess.run(
            [tf.is_variable_initialized(var) for var in global_vars])
        not_initialized_vars = [
            v for (v, f) in zip(global_vars, is_not_initialized) if not f
        ]
        if len(not_initialized_vars):
            sess.run(tf.variables_initializer(not_initialized_vars))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # hyper parameters
        batch_size = 32
        epoch = 500
        early_stop_patience = 50
        min_delta = 0.0001
        opt_type = 'adam'

        # recorder
        epoch_counter = 0

        # optimizer
        global_step = tf.Variable(0, trainable=False)

        # Passing global_step to minimize() will increment it at each step.
        if opt_type == 'sgd':
            start_learning_rate = FLAG.lr
            half_cycle = 2000
            learning_rate = tf.train.exponential_decay(start_learning_rate,
                                                       global_step,
                                                       half_cycle,
                                                       0.5,
                                                       staircase=True)
            opt = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                             momentum=0.9,
                                             use_nesterov=True)
        else:
            start_learning_rate = FLAG.lr
            half_cycle = 2000
            learning_rate = tf.train.exponential_decay(start_learning_rate,
                                                       global_step,
                                                       half_cycle,
                                                       0.5,
                                                       staircase=True)
            opt = tf.train.AdamOptimizer(learning_rate=learning_rate)

        obj = vgg16.loss
        train_op = opt.minimize(obj, global_step=global_step)

        # progress bar
        ptrain = IntProgress()
        pval = IntProgress()
        display(ptrain)
        display(pval)
        ptrain.max = int(Xtrain.shape[0] / batch_size)
        pval.max = int(Xtest.shape[0] / batch_size)

        # re-initialize
        initialize_uninitialized(sess)

        # reset due to adding a new task
        patience_counter = 0
        current_best_val_loss = float('inf')

        # optimize when the aggregated obj
        while (patience_counter < early_stop_patience
               and epoch_counter < epoch):

            # start training
            stime = time.time()
            bar_train = Bar(
                'Training',
                max=int(Xtrain.shape[0] / batch_size),
                suffix='%(index)d/%(max)d - %(percent).1f%% - %(eta)ds')
            bar_val = Bar(
                'Validation',
                max=int(Xtest.shape[0] / batch_size),
                suffix='%(index)d/%(max)d - %(percent).1f%% - %(eta)ds')

            train_loss, train_accu = 0.0, 0.0
            for i in range(int(Xtrain.shape[0] / batch_size)):
                st = i * batch_size
                ed = (i + 1) * batch_size
                loss, accu, _ = sess.run(
                    [obj, vgg16.accuracy, train_op],
                    feed_dict={
                        vgg16.x: Xtrain[st:ed, :],
                        vgg16.y: Ytrain[st:ed, :],
                        vgg16.is_train: True
                    })
                train_loss += loss
                train_accu += accu
                ptrain.value += 1
                ptrain.description = "Training %s/%s" % (ptrain.value,
                                                         ptrain.max)
            train_loss = train_loss / ptrain.value
            train_accu = train_accu / ptrain.value

            # validation
            val_loss = 0
            val_accu = 0
            for i in range(int(Xtest.shape[0] / batch_size)):
                st = i * batch_size
                ed = (i + 1) * batch_size
                loss, accu = sess.run(
                    [obj, vgg16.accuracy],
                    feed_dict={
                        vgg16.x: Xtest[st:ed, :],
                        vgg16.y: Ytest[st:ed, :],
                        vgg16.is_train: False
                    })
                val_loss += loss
                val_accu += accu
                pval.value += 1
                pval.description = "Testing %s/%s" % (pval.value, pval.max)
            val_loss = val_loss / pval.value
            val_accu = val_accu / pval.value

            # plot
            if epoch_counter % 10 == 0:
                Xplot = sess.run(vgg16.pred,
                                 feed_dict={
                                     vgg16.x: Xtrack[:, :],
                                     vgg16.y: Ytrack[:, :],
                                     vgg16.is_train: False
                                 })

                for i, fname in enumerate(track):
                    saveimg = skimage.transform.resize(Xplot[i],
                                                       output_shape=(512, 512),
                                                       order=0,
                                                       preserve_range=True,
                                                       clip=False)
                    saveimg = label2rgb(saveimg)
                    imageio.imwrite(
                        os.path.join(
                            FLAG.save_dir,
                            os.path.basename(fname) + "_pred_" +
                            str(epoch_counter) + ".png"), saveimg)
                    print(
                        os.path.join(
                            FLAG.save_dir,
                            os.path.basename(fname) + "_pred_" +
                            str(epoch_counter) + ".png"))

            # early stopping check
            if (current_best_val_loss - val_loss) > min_delta:
                current_best_val_loss = val_loss
                patience_counter = 0
                saver.save(sess, checkpoint_path, global_step=epoch_counter)
                print("save in %s" % checkpoint_path)
            else:
                patience_counter += 1

            # shuffle Xtrain and Ytrain in the next epoch
            idx = np.random.permutation(Xtrain.shape[0])
            Xtrain, Ytrain = Xtrain[idx, :, :, :], Ytrain[idx, :]

            # epoch end
            epoch_counter += 1

            ptrain.value = 0
            pval.value = 0
            bar_train.finish()
            bar_val.finish()

            print(
                "Epoch %s (%s), %s sec >> train loss: %.4f, train accu: %.4f, val loss: %.4f, val accu: %.4f"
                % (epoch_counter, patience_counter,
                   round(time.time() - stime,
                         2), train_loss, train_accu, val_loss, val_accu))
Example #32
for i, doc in enumerate(data):
    tokenized_doc = list(simple_preprocess(doc, deacc=True, min_len=2))
    stemmed_doc = []
    for word in tokenized_doc:
        stemmed_word = stemmer.stem(word)
        if stemmed_word not in stop_words:
            stemmed_doc.append(stemmed_word)
    #[stemmer.stem(word) for word in tokenized_doc if word not in stop_words]
    if stemmed_doc == []: # Empty document after pre-processing: to be removed
        id_to_delete.append(i)
    else:
        processed_data.append(stemmed_doc)
data = processed_data
target = np.delete(target, id_to_delete, axis=0)
#####
keywords = read_list("constraints/reuters/keywords_freq_auto_5", "str")
keywords = [keywords[i].split(" ") for i in range(len(keywords))] # Otherwise don't stem
nr_kw_perclass = 3
kw = np.array(list(itertools.chain(*keywords)))
counter = np.zeros((len(data), len(kw)))
for i in range(len(data)):
    for k in range(len(data[i])):
        for j in range(len(kw)):
            if kw[j] == data[i][k]:
                counter[i][j] = counter[i][j] + 1
print(len(data))
window = 50
model_path = "models/reuters_w2v_window" + str(window) + ".model"
if isfile(model_path): # Load if the word2vec model exists
    print("Loading an existing word2vec model trained on the dataset...")
    w2v = Word2Vec.load(model_path)
Example #33
https://plotly.com/python/reference/#scatter-texttemplate
https://plotly.com/python/text-and-annotations/
"""
from utils import read_list
import plotly.graph_objects as go

GALAXY = [[425, 400, "Rossem"], [1100, 700, "Haven"], [770, 490, "Neotrantor"],
          [1300, 600, "Askone"], [500, 500, "Tazenda"], [430, 200, "Arcturus"],
          [170, 610, "Kalgan"], [160, 650, "Terminus"], [210, 690, "Anacreon"],
          [1200, 680, "Synnax"], [900, 650, "Radole"], [700, 500, "Trantor"]]
planet_names = [x[2] for x in GALAXY]
planets_x = [x[0] for x in GALAXY]
planets_y = [x[1] for x in GALAXY]
test = ["Tests<br>" + str(i) for i in range(12)]

location_links = read_list("location_links")
hover = [[] for x in range(len(planet_names))]
for link in location_links:
    person = link[1]
    idx = [i for i, x in enumerate(planet_names) if x == link[0]]
    if idx:  # note: any(idx) would wrongly skip planet index 0
        hover[idx[0]].append(person)

PEOPLE = [
    "<b>Linked PER entities:<b><br>- " + "<br>- ".join(y)
    for y in [list(set(x)) for x in hover]
]

# Create figure
fig = go.Figure()
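The snippet cuts off right after the figure is created. A plausible continuation (an assumption, not the original code) would add the planets as a scatter trace with the PEOPLE strings as hover text:

# Assumed continuation: markers labeled with planet names, hover text from PEOPLE.
fig.add_trace(
    go.Scatter(
        x=planets_x,
        y=planets_y,
        mode="markers+text",
        text=planet_names,
        textposition="top center",
        hovertext=PEOPLE,
        hoverinfo="text",
    ))
fig.show()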
Example #34
def parse(file_path):
    lines = read_list(file_path)
    lines = map(str.split, lines)
    files, labels = zip(*lines)
    labels = map(int, labels)
    return (files, labels)
Example #35
from doubanapi import ApiClient
from utils import read_list, write_list

from config import API_KEY, API_SECRET, USERNAME, PASSWORD

if __name__ == '__main__':
    print(sys.argv)
    if len(sys.argv) < 3:
        print('Usage: python %s album_id dir' % sys.argv[0])
        exit(1)
    api = ApiClient(key=API_KEY, secret=API_SECRET)
    print(api.login(USERNAME, PASSWORD))
    album = sys.argv[1]
    directory = sys.argv[2]
    files = os.listdir(directory)
    finished = read_list('%s.txt' % album)
    error_count = 0
    for f in files:
        image = os.path.join(directory, f)
        try:
            if f not in finished:
                print('Uploading %s' % image)
                api.photo_upload(album, image, f)
                finished.append(image)
                time.sleep(2)
        except Exception as e:
            print("error:%s on uploading :%s" % (e, image))
            error_count += 1
            if error_count > 5:
                break
            time.sleep(error_count * 10)