Example #1
def to_csv():
    # read type names
    with open(TYPES + 'types.txt') as f:
        types = lines(f)

    # read type effectiveness, put it into table format
    # [["",    type1, type2],
    #  [type1, 1,     2    ],
    #  [type2, 4,     2    ]]
    with open(TYPES + 'typestable.txt') as f:
        headers = [""] + types
        # effectiveness is a 2d array
        effectiveness = [line.split(" ") for line in lines(f)]

        # prepend the attacking type's name to its row of multipliers
        table = [headers] + [
            [types[attacker]] + row
            for attacker, row in enumerate(effectiveness)
        ]

        # print(table)

    # write the type effectiveness to a csv
    with open(TYPES + 'types.csv', 'w', newline='') as f:
        typesWriter = csv.writer(f)
        typesWriter.writerows(table)
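
All of the examples on this page revolve around a small `lines()` helper (imported from a `util` module as `util.lines`, `ut.lines`, or `lines` directly), whose definition is not shown. As a rough, hypothetical sketch, a version consistent with most of the calls here, which pass a filename, a `Path`, an open file object, or the string output of a shell command, could look like the code below. Note that the plotting example further down uses `ut.lines()` for something unrelated (a list of ROOT line styles), so the sketch does not apply there.

# Hypothetical helper, inferred from how lines() is called in these examples;
# each project's real util.lines may differ.
from pathlib import Path

def lines(source):
    """Return the stripped, non-empty lines of a path, an open file, or a string."""
    if isinstance(source, (str, Path)):
        try:
            text = Path(source).read_text()   # treat it as a file path first
        except (OSError, ValueError):
            text = str(source)                # fall back to raw text (e.g. command output)
    else:
        text = source.read()                  # assume an open file-like object
    return [line.strip() for line in text.splitlines() if line.strip()]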
Example #2
def processRunning(host, pidfile, name=""):
	cmdline = util.lines(host.execute("[ -f %(pidfile)s ] && (cat %(pidfile)s | xargs -r ps --no-headers --format cmd --pid); true" % {"pidfile": util.escape(pidfile)}))
	if not len(cmdline):
		return False
	if name:
		return name in cmdline[0]
	return True
Example #3
def playlist_contains(song_id, playlist_name):
    if not playlist_exists(playlist_name):
        raise KeyError(f'playlist {playlist_name} doesn\'t exist')

    playlist_dir = file_management.get_playlists_path()
    playlist_dest = os.path.join(playlist_dir, playlist_name)

    with open(playlist_dest) as f:
        return song_id in util.lines(f)
Example #4
def load_splits_raw(self):
    self.split_files = [
        (
            self.conf.data_dir / self.name / split_name / self.lang
            ).with_suffix(".conllu")
        for split_name in self.split_names]
    self.splits_raw = [
        conllu.parse("\n".join(lines(f)))
        for f in self.split_files]
Example #5
def do_it(filename):
    for line in util.lines(filename):
        split = line.split()
        n = Node.get_node(split[0] + split[1])
        for inner in range(4, len(split), 4):
            if split[inner] != "no":
                n.add(split[inner + 1] + split[inner + 2], int(split[inner]))

    return len(Node.nodes['shinygold'].distinct_parents())
Example #6
def do_it(filename):
    program = [[l.split()[0], int(l.split()[1])] for l in util.lines(filename)]
    result = NOT_FOUND
    pos = 0
    while result == NOT_FOUND:
        curr, pos = replace_nop_jmp(program, pos)
        result = parse_and_run(curr)

    return result
Example #7
def do_it(filename):
    for line in util.lines(filename):
        split = line.split()
        n = Node.get_node(''.join(split[0:2]))
        for inner in range(4, len(split), 4):
            if split[inner] != "no":
                n.add(''.join(split[inner + 1:inner + 3]), int(split[inner]))

    return Node.get_node('shinygold').inner_bags() - 1
Example #8
def read_sift(sift_fname):
    """ Feature format: [[x, y, scale, orientation], ...] """
    lines = ut.lines(sift_fname)
    if len(lines):
        fd = np.array([[float(x) for x in line.split()] for line in lines])
        f = fd[:, :4]
        d = np.uint8(fd[:, 4:])
        return f, d
    else:
        return np.zeros((4, 0)), np.uint8(np.zeros((128, 0)))
Example #9
def testLines(self):
    from util import lines
    with open('test_input.txt') as f:
        for line, item in enumerate(lines(f)):
            print('%d\t %s' % (line, item))
    print("test")
    self.assertTrue(1 == 1, "Failed")
Example #10
    def parse_file(self, file):
        sents = split_iter(lines(file), lambda line: line == "")
        sents = islice(filter(bool, sents), self.conf.max_ninst)

        def parse_sent(sent):
            parts = map_assert(
                str.split, lambda parts: len(parts) in {3, 7}, sent)
            forms, tags = zip(*map(lambda ps: (ps[0], ps[-1]), parts))
            assert len(forms) == len(tags) == len(sent)
            return [
                {"form": form, "ner": tag} for form, tag in zip(forms, tags)]

        return list(map_skip_assert_error(parse_sent, sents, verbose=True))
Example #11
def downloadCaptureUri(host, name, onlyLatest=False):
	filename = "%s.pcap" % name
	path = host.getHostServer().randomFilename()
	if onlyLatest:
		print path
		latest = util.lines(host.execute("ls -t1 %s | head -n1" % _remoteDir(name)))[0]
		if latest:
			fileutil.copy(host, "%s/%s" % (_remoteDir(name), latest), path)
	else:
		host.execute("tcpslice -w %s %s/*" % (path, _remoteDir(name)))
	if not fileutil.existsFile(host, path) or not fileutil.fileSize(host, path):
		raise fault.new("No packets captured yet")
	return host.getHostServer().downloadGrant(path, filename=filename)
Example #12
def show_playlist(playlist_name):
    filename = f'{playlist_name}.playlist'
    playlist_dir = file_management.get_playlists_path()
    playlist_dest = os.path.join(playlist_dir, filename)

    if not os.path.exists(playlist_dest):
        print(f'playlist does not exist: {playlist_dest}')
        raise KeyError

    res = [playlist_name]
    with open(playlist_dest) as f:
        for song_id in util.lines(f):
            res.append(f'[{song_id}] {songs.get_song_info(song_id)["title"]}')

    return res
Example #13
def run():
    print '''<div type="book" osisID="%s" canonical="true">
<title type="main">%s</title>''' % (bookID, title)

    for line in lines(sys.stdin):
        line = re.sub(
            chapterPattern,
            r'<chapter osisID="%s." chapterTitle="\1">\n<title type="chapter">\1</title>'
            % bookID, line)
        line = re.sub(versePattern,
                      r'<verse osisID="%s.\1.\2">\3</verse>' % bookID, line)
        line = re.sub(r'\*\*', r'</chapter>', line)
        print line.strip()
    print '</div>'
Example #14
def __init__(self, path, words, dim=300, normalize=True, **kwargs):
    seen = []
    vs = {}
    for line in lines(path):
        split = line.split()
        w = split[0]
        if w in words:
            seen.append(w)
            vs[w] = np.array(list(map(float, split[1:])), dtype='float32')
    self.iw = seen
    self.wi = {w: i for i, w in enumerate(self.iw)}
    self.m = np.vstack([vs[w] for w in self.iw])
    if normalize:
        self.normalize()
Example #15
def add_song_to_playlist(song_id: str, playlist_name):
    filename = f'{playlist_name}.playlist'
    playlist_dir = file_management.get_playlists_path()
    playlist_dest = os.path.join(playlist_dir, filename)

    if not os.path.exists(playlist_dest):
        raise KeyError(f'playlist does not exist: {playlist_dest}')

    if song_id not in file_management.get_song_ids():
        raise KeyError('song id does not exist')

    with open(playlist_dest) as f:
        current_songs = util.lines(f)

    if song_id in current_songs:
        print(f'warning! song {song_id} already in current songs! type "y" to continue')
        if input('>') != 'y':
            return

    with open(playlist_dest, 'a') as playlist:
        playlist.write(song_id + '\n')
Example #16
def parse(filename):
    return ContinuousWood(util.lines(filename))
Example #17
def interfaceExists(host, iface):
	return util.lines(host.execute("[ -d /sys/class/net/%s ]; echo $?" % iface))[0] == "0"
Example #18
def interfaceBridge(host, iface):
	return util.lines(host.execute("[ -d /sys/class/net/%s/brport/bridge ] && basename $(readlink /sys/class/net/%s/brport/bridge)" % (iface, iface)))[0]
Example #19
def bridgeExists(host, bridge):
	return util.lines(host.execute("[ -d /sys/class/net/%s/brif ]; echo $?" % bridge))[0] == "0"
Example #20
def get_song_ids():

    dest = get_ids_path()

    with open(dest) as f:
        return util.lines(f)
Example #21
Usage: 'measure dict-file'
Pearls of Computer Science, Week 2
"""

# standard module to access command-line parameter list sys.argv
import sys
# standard module containing process_time function
import time

from ordsearch import linear
from ordsearch import binary

# read words from dictionary file
import util

words = util.lines(sys.argv[1])

# ask for the first word
value = input("Search for first word? ")
# continue as long as a word was typed
while value != "":
    # measure time for linear searching
    lstart = time.process_time()
    lresult = linear(words, value)
    lend = time.process_time()
    # time values are fractions of seconds;
    # multiply by a million and round to get microseconds
    ltime = round((lend - lstart) * 1000000)

    # measure time for binary searching
    bstart = time.process_time()
Example #22
    parR = np.empty(numBins, np.dtype('float64'))
    parRerr = np.empty(numBins, np.dtype('float64'))
    parL = np.empty(numBins, np.dtype('float64'))
    parLerr = np.empty(numBins, np.dtype('float64'))

    for ibin in range(0, numBins):
        hists.append(
            r.TH1D("tmp_" + str(ibin), "tmp_" + str(ibin), 400, 0., 2.))
        energies.append(int(ibin * energyBin))
    energies.append(maxEnergy)

    if numBins > len(ut.colors()):
        for i in range(0, numBins - len(ut.colors())):
            ut.colors().append(ut.colors()[i])
            ut.alpha().append(0.3)
            ut.lines().append(ut.lines()[i])
            ut.width().append(ut.width()[i])
            ut.fill().append(3001)

    leg = r.TLegend(0.25, 0.55, 0.3, 0.85)
    leg.SetTextFont(132)
    leg.SetTextSize(0.05)
    leg.SetFillColor(0)
    leg.SetFillStyle(0)

    ifile = "/eos/user/c/cneubuse/miniCalo2/pred/stage" + str(st) + "/out.root"

    miX = 0.8
    maX = 1.18
    #  if args.stage>1:
    #    miX=0.8
Example #23
def parse_file(self, file):
    sents = split_iter(lines(file), lambda line: line == "")
    sents = islice(filter(bool, sents), self.conf.max_ninst)
    sents = map("\n".join, sents)
    return [conllu.parse(sent)[0] for sent in sents]
Example #24
            continue
        print proc_num, "Running", name
        subredditgen.main(name)
        word_dict = util.load_pickle(DICTS.format(name))
        word_dict.filter_extremes(no_above=0.1, no_below=100)
        to_keep = sorted(word_dict.dfs, key=lambda w : word_dict.dfs[w], reverse=True)[:5000]
        word_dict.filter_tokens(good_ids=to_keep)
        sub_vecs = create_representation("SVD", constants.SUBREDDIT_EMBEDDINGS.format(name))
        pos_seeds, neg_seeds = seeds.twitter_seeds()
        sub_vecs = sub_vecs.get_subembed(set(word_dict.token2id.keys()).union(pos_seeds).union(neg_seeds))
        pols = polarity_induction_methods.bootstrap(sub_vecs, pos_seeds, neg_seeds, return_all=True,
                nn=25, beta=0.9, num_boots=50, n_procs=10)
        util.write_pickle(pols, POLARITIES + name + ".pkl")

if __name__ == "__main__":
    queue = Queue()
    id = int(sys.argv[1])
    valid_ids = set(range(250, 256))
    for i, line in enumerate(util.lines(NAMES)):
        if i in valid_ids:
            name = line.split()[0]
            queue.put(name)
    print queue.qsize()
    procs = [Process(target=worker, args=[i, queue]) for i in range(1)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()


Example #25
    group.add_argument('-l', '--list', action="store_true")
    group.add_argument('-c', '--create', type=str, metavar='<PLAYLIST>')
    group.add_argument('-d', '--delete', type=str, metavar='<PLAYLIST>')
    group.add_argument('-a', '--add-song', type=str, metavar='<PLAYLIST>')
    group.add_argument('-r', '--remove-song', type=str, metavar='<PLAYLIST>')

    parser.add_argument('-s', '--song', type=str, metavar='<SONG NAME>')
    args = parser.parse_args()
    # print(args)
    if args.list:
        playlist_dir = file_management.get_playlists_path()
        for filename in os.listdir(playlist_dir):
            abspath = os.path.join(playlist_dir, filename)
            print(os.path.splitext(filename)[0])
            with open(abspath) as f:
                for song_id in util.lines(f):
                    print(f'[{song_id}] {songs.get_song_info(song_id)["title"]}')

    elif args.create is not None:
        create_playlist(args.create)

    elif args.delete is not None:
        delete_playlist(args.delete)

    elif args.add_song is not None:
        if args.song is None:
            sys.exit('provide a song (-s)')
        add_song_to_playlist(args.song, args.add_song)

    elif args.remove_song is not None:
        if args.song is None:
Example #26
__author__ = 'Egbert'

from ordsearch import binary
from ordsearch import linear
from util import lines
import searchmeasure


print(binary(lines("Unabr.dict"), "eagle"))
print(binary(lines("Unabr.dict"), "zygose"))

searchmeasure.search("Unabr.dict", "eagle")
Example #27
    songs_dir = file_management.get_songs_path()
    for filepath in glob.glob(os.path.join(songs_dir, f'{song_id}.*')):
        abspath = os.path.join(songs_dir, filepath)
        print(f'deleting {abspath}')
        os.remove(abspath)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Song configuration')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-l', '--list', action="store_true")
    group.add_argument('-a', '--add-url', type=str, metavar='<URL>')
    group.add_argument('-d', '--delete', type=str, metavar='<ID>')
    args = parser.parse_args()
    # print(args)
    if args.list:
        ids_path = file_management.get_ids_path()
        with open(ids_path) as f:
            all_ids = util.lines(f)
        if not all_ids:
            print('no songs!')
        for song_id in all_ids:
            json_data = get_song_info(song_id)
            print(f'[{song_id}] {json_data["title"]}')

    elif args.add_url is not None:
        link = args.add_url
        add_song(link)

    elif args.delete is not None:
        song_id = args.delete
        remove_song(song_id)
Example #28
def interfaceBridge(host, iface):
	try:
		return util.lines(host.execute("[ -d /sys/class/net/%s/brport/bridge ] && basename $(readlink /sys/class/net/%s/brport/bridge)" % (util.identifier(iface), util.identifier(iface))))[0]
	except exceptions.CommandError:
		return False
Example #29
class Transformer():

    nspecial_symbols_segment1 = 2  # [CLS] sent1... [SEP]
    nspecial_symbols_segment2 = 1  # sent2... [SEP]
    add_tokens_key = 'additional_special_tokens'
    supported_langs = set(
        lines(Path(__file__).parent / "data" / "bert_langs.wiki"))

    def __init__(self,
                 model_name,
                 device=None,
                 max_len=None,
                 auto_model_cls=AutoModel,
                 only_tokenizer=False,
                 custom_n_hidden=None,
                 custom_n_layers=None):
        super().__init__()
        self.randinit = model_name.endswith('-randinit')
        if self.randinit:
            model_name = model_name[:-len('-randinit')]
        self.model_name = model_name
        self.device = device or _device
        do_lower_case = "uncased" in model_name
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name, do_lower_case=do_lower_case)
        for name in 'mask cls sep bos eos'.split():
            token = getattr(self.tokenizer, name + '_token')
            setattr(self, name.upper(), token)
        # self.begin_mention_idx = self.tokenizer.convert_tokens_to_ids(
        #     self.BEGIN_MENTION)

        if self.model_name.startswith('roberta'):
            self.BEGIN_MENTION = 'madeupword0000'
            self.END_MENTION = 'madeupword0001'
            self.add_special_symbols = self.add_special_symbols_roberta
        else:
            self.BEGIN_MENTION = '[unused0]'
            self.END_MENTION = '[unused1]'
            self.add_special_symbols = self.add_special_symbols_bert
        self.BEGIN_MENTION_IDX = self.tokenizer.convert_tokens_to_ids(
            self.BEGIN_MENTION)
        self.begin_mention_idx = self.BEGIN_MENTION_IDX
        self.END_MENTION_IDX = self.tokenizer.convert_tokens_to_ids(
            self.END_MENTION)
        additional_special_tokens = [self.BEGIN_MENTION, self.END_MENTION]
        self.tokenizer.add_special_tokens(
            {self.add_tokens_key: additional_special_tokens})
        self.max_len = max_len or self.tokenizer.max_len
        self.pad_idx = self.tokenizer.pad_token_id
        self.mask_idx = self.tokenizer.mask_token_id
        self.vocab_size = len(self.tokenizer)

        if not only_tokenizer:
            if self.randinit:
                model_config = AutoConfig.from_pretrained(self.model_name)
                print('creating model with random init', self.model_name)
                if custom_n_hidden:
                    ratio = model_config.intermediate_size // model_config.hidden_size
                    model_config.hidden_size = custom_n_hidden
                    model_config.intermediate_size = ratio * custom_n_hidden
                if custom_n_layers:
                    model_config.num_hidden_layers = custom_n_layers
                self.model = auto_model_cls.from_config(model_config)
                print('custom model_config:', model_config)
            else:
                print('loading model', self.model_name)
                self.model = auto_model_cls.from_pretrained(model_name)
            word_emb = self.model.get_input_embeddings().weight
            self.dim = word_emb.size(1)
            device_count = torch.cuda.device_count()
            self.model.to(device=self.device)

    def update_special_tokens(self, additional_special_tokens):
        current = self.tokenizer.special_tokens_map[self.add_tokens_key]
        self.tokenizer.add_special_tokens(
            {self.add_tokens_key: current + additional_special_tokens})

    def __call__(self, *args, **kwargs):
        return self.model(*args, **kwargs)

    def tokenize(self, text, masked_idxs=None):
        if isinstance(text, str):
            tokenized_text = self.tokenizer.tokenize(text)
            if masked_idxs is not None:
                for idx in masked_idxs:
                    tokenized_text[idx] = self.MASK
            tokenized = self.add_special_symbols(tokenized_text)
            return tokenized
        return list(map(self.tokenize, text))

    def add_special_symbols_bert(self, tokenized_text):
        return [self.CLS] + tokenized_text + [self.SEP]

    def add_special_symbols_roberta(self, tokenized_text):
        return [self.BOS] + tokenized_text + [self.EOS]

    def tokenize_sentence_pair(self, sent1, sent2):
        tokenized_sent1 = self.tokenizer.tokenize(sent1)
        tokenized_sent2 = self.tokenizer.tokenize(sent2)
        return self.add_special_symbols_sent_pair(tokenized_sent1,
                                                  tokenized_sent2)

    def add_special_symbols_sent_pair(self, tokenized_sent1, tokenized_sent2):
        return ([self.CLS] + tokenized_sent1 + [self.SEP] + tokenized_sent2 +
                [self.SEP])

    def tokenize_to_ids(self,
                        text,
                        masked_idxs=None,
                        pad=True,
                        max_len=None,
                        clip_long_seq=False):
        tokens = self.tokenize(text, masked_idxs)
        return self.convert_tokens_to_ids(tokens,
                                          pad=pad,
                                          max_len=max_len,
                                          clip_long_seq=clip_long_seq)

    def tokenize_sentence_pair_to_ids(self, sent1, sent2):
        tokenized_sent1 = self.tokenizer.tokenize(sent1)
        segment1_len = len(tokenized_sent1) + self.nspecial_symbols_segment1
        tokenized_sent2 = self.tokenizer.tokenize(sent2)
        segment2_len = len(tokenized_sent2) + self.nspecial_symbols_segment2
        tokenized_sents = self.add_special_symbols_sent_pair(
            tokenized_sent1, tokenized_sent2)
        padded_ids, padding_mask = self.convert_tokens_to_ids(tokenized_sents)
        segment_ids = self.segment_ids(segment1_len, segment2_len)
        return padded_ids, padding_mask, segment_ids

    def mask_mention_and_tokenize_context(self, collapse_mask, *, left_ctx,
                                          mention, right_ctx, **kwargs):
        left_ctx_tokenized = self.tokenize(left_ctx)[:-1]  # remove [SEP]
        if collapse_mask:
            masked_mention = [self.MASK]
        else:
            mention_tokenized = self.tokenize(mention)
            masked_mention = [self.MASK] * len(mention_tokenized)
        right_ctx_tokenized = self.tokenize(right_ctx)[1:]  # remove [CLS]
        tokens = left_ctx_tokenized + masked_mention + right_ctx_tokenized
        return tokens

    def mask_mention_and_tokenize_context_to_ids(self,
                                                 left_ctx,
                                                 mention,
                                                 right_ctx,
                                                 collapse_mask=True,
                                                 pad=True):
        tokens = self.mask_mention_and_tokenize_context(
            collapse_mask=collapse_mask,
            left_ctx=left_ctx,
            mention=mention,
            right_ctx=right_ctx)
        return tokens, self.convert_tokens_to_ids(tokens, pad=pad)

    def mask_mentions_and_tokenize_contexts_to_ids(self,
                                                   mentions_and_contexts,
                                                   collapse_mask=True):
        tokens = [
            self.mask_mention_and_tokenize_context(collapse_mask=collapse_mask,
                                                   **ment_ctx)
            for ment_ctx in mentions_and_contexts
        ]
        return tokens, self.convert_tokens_to_ids(tokens)

    def convert_tokens_to_ids(self,
                              tokens,
                              pad=True,
                              max_len=None,
                              clip_long_seq=False):
        max_len = max_len or self.max_len
        if not tokens:
            dummy = torch.tensor([]).to(device=self.device)
            if pad:
                return dummy.to(dtype=torch.long), dummy.to(dtype=torch.uint8)
            return dummy
        elif isinstance(tokens[0], list):
            token_idss = map(self.tokenizer.convert_tokens_to_ids, tokens)
            padded_ids = torch.zeros(
                (len(tokens), max_len), dtype=torch.long) + self.pad_idx
            for row_idx, token_ids in enumerate(token_idss):
                token_ids = torch.tensor(token_ids)
                if clip_long_seq:
                    token_ids = token_ids[:max_len]
                padded_ids[row_idx, :len(token_ids)] = token_ids
            padded_ids = padded_ids.to(device=self.device)
            mask = padded_ids != self.pad_idx
            return padded_ids, mask
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        ids = torch.tensor([token_ids]).to(device=self.device)
        if clip_long_seq:
            ids = ids[:, :max_len]
        else:
            assert ids.size(
                1
            ) <= max_len, f'{ids.size(1)} > {max_len}\n{len(tokens)} {tokens}'
        if pad:
            padded_ids = torch.zeros(1, max_len).to(ids) + self.pad_idx
            padded_ids[0, :ids.size(1)] = ids
            mask = torch.zeros(1, max_len).to(ids)
            mask[0, :ids.size(1)] = 1
            return padded_ids, mask
        else:
            return ids

    def subword_tokenize(self,
                         tokens,
                         mask_start_idx=None,
                         mask_end_idx=None,
                         add_mask_start_end_markers=False,
                         collapse_mask=True,
                         apply_mask=True,
                         add_special_symbols=True):
        """Segment each token into subwords while keeping track of
        token boundaries.

        Parameters
        ----------
        tokens: A sequence of strings, representing input tokens.

        Returns
        -------
        A tuple consisting of:
            - A list of subwords, flanked by the required special symbols.
            - An array of indices into the list of subwords, indicating
                that the corresponding subword is the start of a new
                token. For example, [1, 3, 4, 7] means that the subwords
                1, 3, 4, 7 are token starts, while all other subwords
                (0, 2, 5, 6, 8...) are in or at the end of tokens.
                This list allows selecting Bert hidden states that
                represent tokens, which is necessary in sequence
                labeling.
        """
        if mask_start_idx is not None:
            try:
                mask_starts = list(iter(mask_start_idx))
            except TypeError:
                mask_starts = [mask_start_idx]
            if mask_end_idx is None:
                assert len(mask_starts) == 1
                mask_ends = [mask_starts[0] + 1]
            else:
                try:
                    mask_ends = list(iter(mask_end_idx))
                except TypeError:
                    mask_ends = [mask_end_idx]

            mask_start_ends = list(reversed(list(zip(mask_starts, mask_ends))))
            if apply_mask:
                for mask_start, mask_end in mask_start_ends:
                    if collapse_mask:
                        mask_len = 1
                    else:
                        mention = ' '.join(tokens[mask_start:mask_end])
                        mention_subw = self.tokenize(mention)[1:-1]
                        mask_len = len(mention_subw)
                    tokens = (tokens[:mask_start] + [self.MASK] * mask_len +
                              tokens[mask_end:])
            if add_mask_start_end_markers:
                for mask_start, mask_end in mask_start_ends:
                    if apply_mask:
                        if collapse_mask:
                            mask_len = 1
                        else:
                            mention = ' '.join(tokens[mask_start:mask_end])
                            mention_subw = self.tokenize(mention)[1:-1]
                            mask_len = len(mention_subw)
                        mention = [self.MASK] * mask_len
                    else:
                        mention = tokens[mask_start:mask_end]
                    tokens = (tokens[:mask_start] + [self.BEGIN_MENTION] +
                              mention + [self.END_MENTION] + tokens[mask_end:])
                # account for inserted mention markers
                new_mask_starts = [
                    i for i, t in enumerate(tokens) if t == self.BEGIN_MENTION
                ]
                new_mask_ends = [
                    i + 1 for i, t in enumerate(tokens)
                    if t == self.END_MENTION
                ]
                mask_start_ends = list(
                    reversed(list(zip(new_mask_starts, new_mask_ends))))
        subwords = list(map(self.tokenizer.tokenize, tokens))
        subword_lengths = list(map(len, subwords))
        subwords = list(flatten(subwords))
        if add_special_symbols:
            subwords = self.add_special_symbols(subwords)
            offset = 1
            # + 1: assumes one special symbol is prepended to the input sequence
        else:
            offset = 0
        token_start_idxs = offset + np.cumsum([0] + subword_lengths[:-1])
        if mask_start_idx is not None:
            return subwords, token_start_idxs, mask_start_ends
        return subwords, token_start_idxs, None

    def subword_tokenize_to_ids(self,
                                tokens,
                                mask_start_idx=None,
                                mask_end_idx=None,
                                add_mask_start_end_markers=False,
                                collapse_mask=True,
                                apply_mask=True,
                                return_mask_mask=False,
                                return_mask_start_end=False,
                                max_len=None,
                                add_special_symbols=True):
        """Segment each token into subwords while keeping track of
        token boundaries and convert subwords into IDs.

        Parameters
        ----------
        tokens: A sequence of strings, representing input tokens.

        Returns
        -------
        A tuple consisting of:
            - A list of subword IDs, including IDs of the required
                special symbols.
            - A mask indicating padding tokens.
            - An array of indices into the list of subwords. See
                doc of subword_tokenize.
        """
        max_len = max_len or self.max_len
        subwords, token_start_idxs, mask_start_ends = self.subword_tokenize(
            tokens,
            mask_start_idx=mask_start_idx,
            mask_end_idx=mask_end_idx,
            add_mask_start_end_markers=add_mask_start_end_markers,
            collapse_mask=collapse_mask,
            apply_mask=apply_mask,
            add_special_symbols=add_special_symbols)
        subword_ids, padding_mask = self.convert_tokens_to_ids(subwords,
                                                               max_len=max_len)
        token_starts = torch.zeros(1, max_len).to(subword_ids)
        token_starts[0, token_start_idxs] = 1
        if return_mask_mask:
            mask_mask = torch.zeros(1, max_len).to(subword_ids)
            for mask_start, mask_end in mask_start_ends:
                token_mask_idxs = list(range(mask_start, mask_end))
                subw_mask_idxs = token_start_idxs[token_mask_idxs]
                mask_mask[0, subw_mask_idxs] = 1
            if return_mask_start_end:
                mask_start_end = torch.zeros(1, max_len).to(subword_ids)
                # this only works if there are fewer than seq_len // 2 masks
                for i, (mask_start, mask_end) in enumerate(mask_start_ends):
                    token_mask_idxs = list(range(mask_start, mask_end))
                    subw_mask_idxs = token_start_idxs[token_mask_idxs]
                    mask_start_end[0, 2 * i] = int(subw_mask_idxs[0])
                    mask_start_end[0, 2 * i + 1] = int(subw_mask_idxs[-1])
                return (subword_ids, padding_mask, token_starts, mask_mask,
                        mask_start_end)
            else:
                return subword_ids, padding_mask, token_starts, mask_mask
        return subword_ids, padding_mask, token_starts

    def segment_ids(self, segment1_len, segment2_len, pad=True, max_len=None):
        max_len = max_len or self.max_len
        npad = max_len - segment1_len - segment2_len
        ids = [0] * segment1_len + [1] * segment2_len + [0] * npad
        assert len(ids) == max_len
        return torch.tensor([ids]).to(device=self.device)
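
A rough usage sketch of the wrapper above; the model name, device, max_len, and input sentence are illustrative assumptions (not taken from the original project), and it relies on the module's own torch/transformers imports:

# Hypothetical usage; model name and inputs are illustrative assumptions.
transformer = Transformer("bert-base-uncased", device="cpu", max_len=32,
                          only_tokenizer=True)

tokens = "The quick brown fox jumps over the lazy dog".split()
subword_ids, padding_mask, token_starts = transformer.subword_tokenize_to_ids(tokens)

# subword_ids and padding_mask are (1, max_len) tensors; token_starts flags the
# subword positions that begin each original token (useful for sequence labeling).
print(subword_ids.shape, padding_mask.shape, token_starts.nonzero().flatten())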
Example #30
def _intdef_ids():
    intdef_ids = Bunch([(x, i) for i, x in enumerate(
        lines(INTDEFS_CSV)) if x.strip()])
    return intdef_ids
Example #31
def _register_id_map():
    return Bunch([(x, i) for i, x in enumerate(lines(REGISTERS_CSV))])
Example #32
import argparse
import os
import util
import subprocess
import file_management

plp = file_management.get_playlists_path()
sgp = file_management.get_songs_path()

parser = argparse.ArgumentParser()
parser.add_argument('-p', '--playlist', type=str, metavar='<TITLE>')
args = parser.parse_args()

playlist = args.playlist

aspath = os.path.join(plp, f'{playlist}.playlist')

with open(aspath) as f:
    for line in util.lines(f):
        song_dest = os.path.join(sgp, f'{line}.m4a')
        # subprocess.run(['ffplay', '-nodisp', '-nostats', '-hide_banner', song_dest])
        subprocess.run(['afplay', song_dest])
Example #33
        sub_vecs = create_representation(
            "SVD", constants.SUBREDDIT_EMBEDDINGS.format(name))
        pos_seeds, neg_seeds = seeds.twitter_seeds()
        sub_vecs = sub_vecs.get_subembed(
            set(word_dict.token2id.keys()).union(pos_seeds).union(neg_seeds))
        pols = polarity_induction_methods.bootstrap(sub_vecs,
                                                    pos_seeds,
                                                    neg_seeds,
                                                    return_all=True,
                                                    nn=25,
                                                    beta=0.9,
                                                    num_boots=50,
                                                    n_procs=10)
        util.write_pickle(pols, POLARITIES + name + ".pkl")


if __name__ == "__main__":
    queue = Queue()
    id = int(sys.argv[1])
    valid_ids = set(range(250, 256))
    for i, line in enumerate(util.lines(NAMES)):
        if i in valid_ids:
            name = line.split()[0]
            queue.put(name)
    print queue.qsize()
    procs = [Process(target=worker, args=[i, queue]) for i in range(1)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()