def main():
    """Sort loose .torrent files in <maindir>/All-Torrs\\ into per-tracker
    subdirectories, based on each torrent's announce URL host.

    Python 2 dialect (dict.iteritems).  Side effects: creates tracker
    directories and moves (renames) every file it inspects.
    """
    ss = Preferences()  # settings.1py

    directory_path = os.path.join(
        ss.get("maindir"), u"All-Torrs\\"
    )  # needs a unicode symbol so os. commands work at all on paths with funny chars

    # absolute paths + names of every file directly inside directory_path
    files = [
        os.path.join(directory_path, fn) for fn in next(os.walk(directory_path))[2]
    ]

    for eachfile in files:
        # BUGFIX: reset per file.  Previously `tracker` leaked across loop
        # iterations, so a torrent lacking both "announce" and
        # "announce-list" was silently filed under the PREVIOUS file's
        # tracker instead of "None".
        tracker = "None"
        with open(eachfile, "rb") as stringfile:
            try:
                torrent = bencode.decode(stringfile.read())
                for key, value in torrent.iteritems():
                    if key == "announce":
                        # keep only the host part of the announce URL
                        domain = "{uri.netloc}".format(uri=urlparse(value))
                        colon = domain.find(":", 0)
                        if colon != -1:
                            domain = domain[:colon]  # strip ":port"
                        if domain:
                            tracker = domain  # only using 1 value here(lazy)
                    elif key == "announce-list":
                        tracker = "Multiple Trackers"
            except Exception:  # undecodable torrent -> goes to the "None" dir
                tracker = "None"
        torrentfilename = eachfile[eachfile.rfind("\\") + 1:]

        if not os.path.exists(directory_path + tracker):
            os.makedirs(directory_path + tracker)
        os.rename(eachfile, os.path.join(directory_path + tracker + "\\" + torrentfilename))
示例#2
0
def main():
    """Dump "path / caption / infohash" for every torrent in uTorrent's
    resume.dat to <maindir>/TorrentList.txt, one entry per line.

    Python 2 dialect (iteritems, print statement, bytes-mode write).
    """

    ss = Preferences()

    # resume.dat is a bencoded dict: one sub-dict per torrent plus a couple
    # of housekeeping keys that are popped off below
    torrentlist = bencode.decode_from_file(ss.get("utresumedat"))
    partiallist = [
    ]  # set up an empty container for desired data to get put into for later

    fileguarduseless = torrentlist.pop(".fileguard", None)
    rec = torrentlist.pop("rec", None)  #Remove this.
    #(dict. comprehension expects only dicts as the root keys)
    #create a reverse lookup dict with "Dict comprehension". nice and simple eh? ;-)
    # NOTE(review): value["info"] is presumably the raw infohash bytes and
    # b16encode turns it into uppercase hex -- confirm against resume.dat format
    reverselookup = {
        base64.b16encode(value["info"]):
        [value["path"], value["caption"], origkey]
        for origkey, value in torrentlist.iteritems()
    }
    for thehash, value in reverselookup.iteritems():
        partiallist.append([value[0], value[1], thehash])

    partiallist.sort()
    writelistfile = open(
        os.path.join(ss.get("maindir"), "TorrentList.txt"),
        'wb')  # write-out a text file with one entry per line.
    for eachline in partiallist:
        writelistfile.write(eachline[0] + " / " + eachline[1] + " / " +
                            eachline[2] + "\n")
        #path           /   #caption          /     #infohash
    writelistfile.close()
    print "Finished writing: TorrentList.txt"
def main():
    """Scan every .torrent in script1sourcedir, compute each one's SHA1
    info-hash, and write "torrentID / infohash / filename" lines to
    outpath1 ("1seeding_ID+Hash+Filename.txt")."""
    ss = Preferences()
    script1sourcedir = ss.getwpath("script1sourcedir")  # ("seeding\")
    # absolute paths + names of every file directly inside the source dir
    files = [os.path.join(script1sourcedir, filename)
             for filename in next(os.walk(script1sourcedir))[2]]

    container = []  # one [filename, internalname, infohash, torrentid] per file
    for currentfile, eachfile in enumerate(files, 1):
        metainfo = decoder.decode_from_file(eachfile)
        # need to manually SHA1 hash the torrent file's info-dict to get the info-hash
        infodict = metainfo[b'info']
        info_hash = hashlib.sha1(encode.encode(infodict)).hexdigest().upper()

        internalname = infodict[b'name']
        torrentfilename = eachfile[eachfile.rfind("\\") + 1:]
        locextension = torrentfilename.find(".torrent")  # char position of extension
        locid = torrentfilename.rfind("-") + 1           # char position of torrentID
        torrentid = torrentfilename[locid:locextension]  # grab torrentID

        container.append([torrentfilename, internalname, info_hash, torrentid])
        # console output is ascii only, cannot print unicode - chars are omitted
        print(currentfile, torrentfilename.encode('ascii', errors='ignore').decode())

    # WRITE FILE 1 ("1seeding_ID+Hash+Filename.txt"):
    # one "torrentID / Hash / torrentfilename" line per torrent.
    # `with` guarantees the handle is closed even if a write fails.
    with codecs.open(ss.getwpath("outpath1"), 'wb', "utf-8") as writelistfile:
        for eachline in container:
            writelistfile.write(eachline[3] + " / " + eachline[2] + " / " + eachline[0] + "\n")
示例#4
0
def main():
    """Dump "path / caption / infohash" for every torrent in uTorrent's
    resume.dat to <maindir>/TorrentList.txt, one entry per line (Python 3)."""
    ss = Preferences()

    torrentlist = bencode.decode_from_file(ss.get("utresumedat"))

    # Drop the two housekeeping root keys so the dict comprehension below
    # only ever sees per-torrent dicts as values.
    torrentlist.pop(b".fileguard", None)
    torrentlist.pop(b"rec", None)
    # reverse lookup: hexencoded infohash -> [path, caption, original key]
    reverselookup = {
        base64.b16encode(value[b"info"]): [value[b"path"], value[b"caption"], origkey]
        for origkey, value in torrentlist.items()
    }
    partiallist = [
        [value[0].decode('utf-8'), value[1].decode('utf-8'), thehash.decode('utf-8')]
        for thehash, value in reverselookup.items()
    ]

    partiallist.sort()
    # write-out a text file with one entry per line: path / caption / infohash.
    # `with` guarantees the handle is closed even if a write fails.
    with open(os.path.join(ss.get("maindir"), "TorrentList.txt"), 'w',
              encoding='utf-8') as writelistfile:
        for eachline in partiallist:
            writelistfile.write(eachline[0] + " / " + eachline[1] + " / " + eachline[2] + "\n")
def main():
    """For every what.cd JSON file in script3destdir (files named after
    infohashes), write "<filename> / <infoHash from the JSON>" lines to
    outpath3 ("3propernames.txt") -- the master list of names and hashes."""
    ss = Preferences()
    directory_path = ss.getwpath("script3destdir")  # ("hash-grabs-as-filenames" dir)
    # absolute paths + names of every file directly inside the directory
    allfiles = [os.path.join(directory_path, fn)
                for fn in next(os.walk(directory_path))[2]]

    # `with` guarantees the output handle is closed even on error
    with codecs.open(ss.getwpath("outpath3"), 'wb', "utf-8") as writelistfile:
        for hashidfilename in allfiles:  # iterate through filenames of what.cd JSON data
            with open(hashidfilename, 'r') as stringfile:
                response = json.load(stringfile)
            torrentHash = response["torrent"]["infoHash"]  # the hash to compare
            writelistfile.write(hashidfilename[hashidfilename.rfind("\\") + 1:]
                                + " / " + torrentHash + "\n")
def main():
    """Dump "path / caption / infohash" for every torrent in uTorrent's
    resume.dat to <maindir>/TorrentList.txt, one entry per line.

    Python 2 dialect (iteritems, print statement, bytes-mode write).
    """

    ss = Preferences()

    # resume.dat is a bencoded dict: one sub-dict per torrent plus two
    # housekeeping keys that are popped off below
    torrentlist = bencode.decode_from_file(ss.get("utresumedat"))
    partiallist = []    # set up an empty container for desired data to get put into for later

    fileguarduseless = torrentlist.pop(".fileguard",None)
    rec = torrentlist.pop("rec",None)   #Remove this. 
    #(dict. comprehension expects only dicts as the root keys)
    #create a reverse lookup dict with "Dict comprehension". nice and simple eh? ;-)
    # NOTE(review): value["info"] is presumably the raw infohash bytes and
    # b16encode turns it into uppercase hex -- confirm against resume.dat format
    reverselookup={base64.b16encode(value["info"]):[value["path"],value["caption"],origkey] for origkey,value in torrentlist.iteritems()}
    for thehash,value in reverselookup.iteritems():
        partiallist.append([value[0],value[1],thehash])

    partiallist.sort()
    writelistfile = open(os.path.join(ss.get("maindir"),"TorrentList.txt"),'wb') # write-out a text file with one entry per line.
    for eachline in partiallist:
        writelistfile.write(eachline[0] + " / " + eachline[1] + " / " + eachline[2] + "\n")
                            #path           /   #caption          /     #infohash
    writelistfile.close()
    print "Finished writing: TorrentList.txt"
def main():
    """Sort loose .torrent files in <maindir>/All-Torrs\\ into per-tracker
    subdirectories, based on each torrent's announce URL host.

    Python 2 dialect (dict.iteritems).  Side effects: creates tracker
    directories and moves (renames) every file it inspects.
    """
    ss = Preferences()  #settings.1py

    directory_path = os.path.join(
        ss.get("maindir"), u"All-Torrs\\"
    )  #needs a unicode symbol so os. commands work at all on paths with funny chars

    # absolute paths + names of every file directly inside directory_path
    files = [
        os.path.join(directory_path, fn)
        for fn in next(os.walk(directory_path))[2]
    ]

    for eachfile in files:
        # BUGFIX: reset per file.  Previously `tracker` leaked across loop
        # iterations, so a torrent lacking both "announce" and
        # "announce-list" was silently filed under the PREVIOUS file's
        # tracker instead of "None".
        tracker = "None"
        with open(eachfile, 'rb') as stringfile:
            try:
                torrent = bencode.decode(stringfile.read())
                for key, value in torrent.iteritems():
                    if key == "announce":
                        # keep only the host part of the announce URL
                        domain = '{uri.netloc}'.format(uri=urlparse(value))
                        colon = domain.find(':', 0)
                        if colon != -1:
                            domain = domain[:colon]  # strip ":port"
                        if domain:
                            tracker = domain  #only using 1 value here(lazy)
                    elif key == "announce-list":
                        tracker = "Multiple Trackers"
            except Exception:  # undecodable torrent -> goes to the "None" dir
                tracker = "None"
        torrentfilename = eachfile[eachfile.rfind("\\") + 1:]

        if not os.path.exists(directory_path + tracker):
            os.makedirs(directory_path + tracker)
        os.rename(
            eachfile,
            os.path.join(directory_path + tracker + "\\" + torrentfilename))
示例#8
0
def main():
    """Download what.cd JSON metadata for every torrent ID listed in
    outpath1, saving each response to a file named after its infohash in
    script2destdir.  Resumable: bump `currentline` to skip completed lines.
    """
    ss = Preferences()

    # to resume a broken download, set this to the last SUCCESSFUL number
    # (due to 1 starting at 0) that was printed to the console
    currentline = 0
    try:
        # cookies speed up the HTTP (supposedly)
        with open(ss.getwpath("cookiesfile"), 'rb') as cookiejar:
            cookies = pickle.load(cookiejar)
    except Exception:
        cookies = None  # if we can't load it, don't use it
    # credentials live in a separate file which is .git-ignored
    with open(ss.getwpath("credentialsfile"), 'rb') as credfile:
        credentials = credfile.readlines()
    username = credentials[0].strip()
    password = credentials[1].strip()

    apihandle = whatapi.WhatAPI(config_file=None,
                                username=username,
                                password=password,
                                cookies=cookies)

    filenamewithIDs = ss.getwpath(
        "outpath1")  # ("1seeding_ID+Hash+Filename.txt")
    hashdir = ss.getwpath("script2destdir")  # output dir

    with open(filenamewithIDs, 'r', encoding='utf-8') as listfile:
        openedfile = listfile.readlines()
    # islice lets the loop continue where a previous run left off
    for eachline in islice(openedfile, currentline, None):
        idandhash = eachline.strip().split(' / ')
        currentID = idandhash[0]
        currentHash = idandhash[1]
        # skip torrents we already fetched (output file exists)
        if not os.path.exists(os.path.join(hashdir, currentHash)):
            try:
                # talk to server and receive a response
                # (1.75 is presumably a per-request sleep interval --
                # the sibling script notes "the 0 means time.sleep(0)")
                response = apihandle.request(1.75, "torrent",
                                             id=currentID)["response"]
            except whatapi.RequestException:
                currentline += 1
                print(currentline,
                      " ERROR. Your search did not match anything.")
                continue
            with open(os.path.join(hashdir, currentHash), 'w') as outfile:
                json.dump(response, outfile, sort_keys=True)
            currentline += 1
            print(currentline, ": ", currentID)

    # store cookies when the script ends, for the next run
    with open(ss.getwpath("cookiesfile"), 'wb') as cookiejar:
        pickle.dump(apihandle.session.cookies, cookiejar)
    print("Download Complete.")
def main():
    """Download what.cd JSON metadata for every infohash listed in
    outpath1, saving each response to a file named after its infohash in
    script2destdir.  Resumable via `currentline`.

    Python 2 dialect (print statement).  Unlike the sibling variant, this
    one queries by hash= rather than id= and sleeps 0 between requests.
    """
    ss = Preferences()

    currentline = (
        0
    )  # to resume a broken download. set this to the last SUCCESSFUL number (due to 1 starting at 0) that you see was outputted to console

    try:
        cookies = pickle.load(open(ss.getwpath("cookiesfile"), "rb"))  # cookies speed up the HTTP (supposedly)
    except:
        cookies = None  # if we cant load it, don't use it.
    credentials = open(
        ss.getwpath("credentialsfile"), "rb"
    ).readlines()  # store credentials in another file and .git-ignore it
    username = credentials[0].strip()
    password = credentials[1].strip()

    apihandle = whatapi.WhatAPI(config_file=None, username=username, password=password, cookies=cookies)

    filenamewithIDs = ss.getwpath("outpath1")  # ("1seeding_ID+Hash+Filename.txt")
    hashdir = ss.getwpath("script2destdir")  # output dir

    openedfile = open(filenamewithIDs, "r").readlines()
    for eachline in islice(openedfile, currentline, None):  # will continue where it left off
        # each line is "torrentID / infohash / filename"
        idandhash = eachline.strip().split(" / ")
        currentID = idandhash[0]
        currentHash = idandhash[1]
        # skip torrents already fetched (output file exists)
        if not os.path.exists(os.path.join(hashdir, currentHash)):
            # currentHash = "E7A5718EC52633FCCB1EA85656AA0622543994D7"   #test hash for debugging
            try:
                response = apihandle.request(0, "torrent", hash=currentHash)[
                    "response"
                ]  # talk to server and receive a response. the 0 means time.sleep(0).
            except whatapi.RequestException as e:
                currentline += 1
                print currentline, " ERROR. Your search did not match anything."
                continue
            with open(os.path.join(hashdir, currentHash), "w") as outfile:
                json.dump(response, outfile, sort_keys=True)
            currentline += 1
            print currentline, ": ", currentHash

    pickle.dump(
        apihandle.session.cookies, open(ss.getwpath("cookiesfile"), "wb")
    )  # store cookies when script ends, for next-run.
    print "Download Complete."
示例#10
0
def main():
    ss = Preferences()
    script1sourcedir = ss.getwpath(
        u"script1sourcedir"
    ) + u''  #("seeding\"), needs unicode u for file opening.
    files = [
        os.path.join(script1sourcedir, filename)
        for filename in next(os.walk(script1sourcedir))[2]
    ]  #gives absolute paths + names

    currentfile = 0

    container = [
    ]  #set up an empty container for desired data to get put into for later
    for eachfile in files:

        metainfo = bencode.decode_from_file(eachfile)
        # #need to manually SHA1 hash the torrent file's info-dict to get the info-hash
        infodict = metainfo['info']
        info_hash = hashlib.sha1(bencode.bencode(infodict)).hexdigest().upper()

        internalname = infodict['name']
        torrentfilename = eachfile[eachfile.rfind("\\") + 1:]
        locextension = torrentfilename.find(
            ".torrent")  #location of extension (char position)
        locid = torrentfilename.rfind(
            "-") + 1  #location of torrentID (char position)
        torrentid = torrentfilename[locid:locextension]  #grab torrentID

        torrentfilename = torrentfilename[:locid - 1]

        #####-------------replace banned characters with unicode section-----------------######
        ###
        # Forward slashes are strange. "FullWidth" is very wide and would be too wide if theres already spaces around it.
        torrentfilename = torrentfilename.replace(
            " / ", u"/")  # U+FFOF  (wide)       FULLWIDTH SOLIDUS
        # "Division" slash is too narrow and needs spaces inserted surrounding it (and is still less width than the fullwidth)
        torrentfilename = torrentfilename.replace(
            "/", u" ∕ ")  # U+2215  (narrow)     DIVISION SLASH
        # Backslash (requires two slashes in python)
        torrentfilename = torrentfilename.replace(
            "\\", u"\")  # U+FF3C               FULLWIDTH REVERSE SOLIDUS
        # Colon
        torrentfilename = torrentfilename.replace(
            ":", u"꞉")  # U+A789               MODIFIER LETTER COLON
        # asterisk
        torrentfilename = torrentfilename.replace(
            "*", u"※")  # U+203B               REFERENCE MARK
        # question mark (replacement is backwards, sorry)
        torrentfilename = torrentfilename.replace(
            "?", u"؟")  # U+061F               ARABIC QUESTION MARK
        # Double-quote
        torrentfilename = torrentfilename.replace(
            '"', u"ʺ")  # U+02BA               MODIFIER LETTER DOUBLE PRIME
        # Left angle bracket
        torrentfilename = torrentfilename.replace(
            "<", u"˂")  # U+02C2               MODIFIER LETTER LEFT ARROWHEAD
        # right angle bracket
        torrentfilename = torrentfilename.replace(
            ">", u"˃")  # U+02C3               MODIFIER LETTER RIGHT ARROWHEAD
        # Pipe
        torrentfilename = torrentfilename.replace(
            "|", u"ǀ")  # U+01C0               LATIN LETTER DENTAL CLICK
        ###
        #####----------windows filename banned chars replacement with unicode-----------######

        container.append([torrentfilename, internalname, info_hash, torrentid])
        currentfile += 1
        print currentfile, torrentfilename.encode('ascii', errors='ignore')

    print "\nReminder: Console output is ascii only, Cannot Print Unicode. (chars omitted)"
    ##File Output. The Master List file of everything.##
    # when the loop exits, Sort it, and write it to the file.
    container.sort()
    writelistfile = codecs.open(
        ss.getwpath("outpath3"), 'wb', "utf-8"
    )  # write-out a text file with one entry per line. main output file (3propernames.txt)
    for eachline in container:
        writelistfile.write(eachline[0] + " / " + eachline[2] +
                            "\n")  #torrentname  / infohash
    writelistfile.close()
    print "Completed. Unicode File Written to: ", os.path.basename(
        ss.getwpath("outpath3"))
示例#11
0
    def _calc_terminal_scores(self, w):
        """ Calculate the score for each possible terminal/token match """
        # Returns a dict mapping token index -> { terminal: relative score }
        # for the tokens spanned by the tree node w.  Higher scores mark the
        # preferred terminal alternatives; the scores combine configured
        # preferences (Preferences.get) with hand-tuned per-category
        # heuristics.  Code left byte-identical; the adjustments below are
        # order-sensitive.

        # First pass: for each token, find the possible terminals that
        # can correspond to that token
        finals = defaultdict(set)
        tokens = dict()
        self._find_options(w, finals, tokens)

        # Second pass: find a (partial) ordering by scoring the terminal alternatives for each token
        scores = dict()

        # Loop through the indices of the tokens spanned by this tree
        for i in range(w.start, w.end):

            s = finals[i]
            # Initially, each alternative has a score of 0
            scores[i] = {terminal: 0 for terminal in s}

            #print("Reducing token '{0}'; scores dict initialized to:\n{1}".format(tokens[i].t1, scores[i]))

            if len(s) <= 1:
                # No ambiguity to resolve here
                continue

            # More than one terminal in the option set for the token at index i
            # Calculate the relative scores
            # Find out whether the first part of all the terminals are the same
            same_first = len(set(terminal.first for terminal in s)) == 1
            txt = tokens[i].lower
            # No need to check preferences if the first parts of all possible terminals are equal
            # Look up the preference ordering from Reynir.conf, if any
            prefs = None if same_first else Preferences.get(txt)
            found_pref = False
            sc = scores[i]
            if prefs:
                # Accumulate the single most extreme adjustment per terminal
                # (min for demotions, max for promotions) rather than summing
                # every matching preference pair
                adj_worse = defaultdict(int)
                adj_better = defaultdict(int)
                for worse, better, factor in prefs:
                    for wt in s:
                        if wt.first in worse:
                            for bt in s:
                                if wt is not bt and bt.first in better:
                                    if bt.name[0] in "\"'":
                                        # Literal terminal: be even more aggressive in promoting it
                                        adj_w = -2 * factor
                                        adj_b = +6 * factor
                                    else:
                                        adj_w = -2 * factor
                                        adj_b = +4 * factor
                                    adj_worse[wt] = min(adj_worse[wt], adj_w)
                                    adj_better[bt] = max(adj_better[bt], adj_b)
                                    found_pref = True
                for wt, adj in adj_worse.items():
                    #print("Token '{2}': Adjusting score of terminal '{0}' by {1}".format(wt, adj, txt))
                    sc[wt] += adj
                for bt, adj in adj_better.items():
                    #print("Token '{2}': Adjusting score of terminal '{0}' by {1}".format(bt, adj, txt))
                    sc[bt] += adj
            #if not same_first and not found_pref:
            #    # Only display cases where there might be a missing pref
            #    print("Token '{0}' has {1} possible terminal matches: {2}".format(txt, len(s), s))

            # Apply heuristics to each terminal that potentially matches this token
            # (category is determined by the first part of the terminal name)
            for t in s:
                tfirst = t.first
                if tfirst == "ao" or tfirst == "eo":
                    # Subtract from the score of all ao and eo
                    sc[t] -= 1
                elif tfirst == "no":
                    if t.is_singular:
                        # Add to singular nouns relative to plural ones
                        sc[t] += 1
                    elif t.is_abbrev:
                        # Punish abbreviations in favor of other more specific terminals
                        sc[t] -= 1
                elif tfirst == "fs":
                    if t.has_variant("nf"):
                        # Reduce the weight of the 'artificial' nominative prepositions
                        # 'næstum', 'sem', 'um'
                        sc[t] -= 5  # Make other cases outweigh the Nl_nf bonus of +4 (-2 -3 = -5)
                    elif txt == "við" and t.has_variant("þgf"):
                        sc[t] += 1  # Smaller bonus for við + þgf (is rarer than við + þf)
                    elif txt == "sem" and t.has_variant("þf"):
                        sc[t] -= 6  # Even less attractive than sem_nf
                    else:
                        # Else, give a bonus for each matched preposition
                        sc[t] += 2
                elif tfirst == "so":
                    if t.variant(0) in "012":
                        # Consider verb arguments
                        # Normally, we give a bonus for verb arguments: the more matched, the better
                        numcases = int(t.variant(0))
                        adj = 2 * numcases
                        # !!! Logic should be added here to encourage zero arguments for verbs in 'miðmynd'
                        if numcases == 0:
                            # Zero arguments: we might not like this
                            if all((m.stofn not in VerbObjects.VERBS[0]) and (
                                    "MM" not in m.beyging)
                                   for m in tokens[i].t2 if m.ordfl == "so"):
                                # No meaning where the verb has zero arguments
                                adj = -5
                        # Apply score adjustments for verbs with particular object cases,
                        # as specified by $score(n) pragmas in Verbs.conf
                        # In the (rare) cases where there are conflicting scores,
                        # apply the most positive adjustment
                        adjmax = 0
                        for m in tokens[i].t2:
                            if m.ordfl == "so":
                                key = m.stofn + t.verb_cases
                                score = VerbObjects.SCORES.get(key)
                                if score is not None:
                                    adjmax = score
                                    break
                        sc[t] += adj + adjmax
                    if t.is_sagnb:
                        # We like sagnb and lh, it means that more
                        # than one piece clicks into place
                        sc[t] += 6
                    elif t.is_lh:
                        # sagnb is preferred to lh, but vb (veik beyging) is discouraged
                        if t.has_variant("vb"):
                            sc[t] -= 2
                        else:
                            sc[t] += 3
                    elif t.is_mm:
                        # Encourage mm forms. The encouragement should be better than
                        # the score for matching a single case, so we pick so_0_mm
                        # rather than so_1_þgf, for instance.
                        sc[t] += 3
                    elif t.is_vh:
                        # Encourage vh forms
                        sc[t] += 2
                    if t.is_subj:
                        # Give a small bonus for subject matches
                        if t.has_variant("none"):
                            # ... but a punishment for subj_none
                            sc[t] -= 3
                        else:
                            sc[t] += 1
                    if t.is_nh:
                        if (i > 0) and any(pt.first == 'nhm'
                                           for pt in finals[i - 1]):
                            # Give a bonus for adjacent nhm + so_nh terminals
                            sc[t] += 4  # Prop up the verb terminal with the nh variant
                            for pt in scores[i - 1].keys():
                                if pt.first == 'nhm':
                                    # Prop up the nhm terminal
                                    scores[i - 1][pt] += 2
                                    # print("Propping up nhm for verb {1}, score is now {0}".format(scores[i-1][pt], tokens[i].t1))
                                    break
                        if any(pt.first == "no" and pt.has_variant("ef")
                               and pt.is_plural for pt in s):
                            # If this is a so_nh and an alternative no_ef_ft exists, choose this one
                            # (for example, 'hafa', 'vera', 'gera', 'fara', 'mynda', 'berja', 'borða')
                            sc[t] += 4
                elif tfirst == "tala" or tfirst == "töl":
                    # A complete 'töl' or 'no' is better (has more info) than a rough 'tala'
                    if tfirst == "tala":
                        sc[t] -= 1
                    # Discourage possessive ('ef') meanings for numbers
                    for pt in s:
                        if (pt.first == "no"
                                or pt.first == "töl") and pt.has_variant("ef"):
                            sc[pt] -= 1
                elif tfirst == "sérnafn":
                    if not tokens[i].t2:
                        # If there are no BÍN meanings, we had no choice but to use sérnafn,
                        # so alleviate some of the penalty given by the grammar
                        sc[t] += 2
                    else:
                        # BÍN meanings are available: discourage this
                        #print("sérnafn '{0}': BÍN meanings available, discouraging".format(tokens[i].t1))
                        sc[t] -= 6
                        if i == w.start:
                            # First token in sentence, and we have BÍN meanings:
                            # further discourage this
                            sc[t] -= 4
                        #print("Meanings for sérnafn {0}:".format(tokens[i].t1))
                        #for m in tokens[i].t2:
                        #    print("{0}".format(m))
                    #        if m.stofn[0].isupper():
                    #            sc[t] -= 4 # Discourage 'sérnafn' if an uppercase BÍN meaning is available
                    #            break
                elif t.name[0] in "\"'":
                    # Give a bonus for exact or semi-exact matches
                    sc[t] += 1

        #for i in range(w.start, w.end):
        #    print("At token '{0}' scores dict is:\n{1}".format(tokens[i].t1, scores[i]))
        return scores
示例#12
0
File: reducer.py  Project: halldor/Reynir
    def go_with_score(self, forest):

        """ Returns the argument forest after pruning it down to a single tree """

        if forest is None:
            return (None, 0)
        w = forest

        # First pass: for each token, find the possible terminals that
        # can correspond to that token
        finals = defaultdict(set)
        tokens = dict()
        self._find_options(w, finals, tokens)

        # Second pass: find a (partial) ordering by scoring the terminal alternatives for each token
        scores = dict()

        # Loop through the indices of the tokens spanned by this tree
        for i in range(w.start, w.end):

            s = finals[i]
            # Initially, each alternative has a score of 0
            scores[i] = { terminal: 0 for terminal in s }
            if len(s) > 1:
                # More than one terminal in the option set
                # Calculate the relative scores
                # Find out whether the first part of all the terminals are the same
                same_first = len(set(x.first for x in s)) == 1
                txt = tokens[i].lower
                # No need to check preferences if the first parts of all possible terminals are equal
                # Look up the preference ordering from Reynir.conf, if any
                prefs = None if same_first else Preferences.get(txt)
                found_pref = False
                sc = scores[i]
                if prefs:
                    for worse, better, factor in prefs:
                        for wt in s:
                            if wt.first in worse:
                                for bt in s:
                                    if wt is not bt and bt.first in better:
                                        if bt.name[0] in "\"'":
                                            # Literal terminal: be even more aggressive in promoting it
                                            sc[wt] -= 2 * factor
                                            sc[bt] += 6 * factor
                                        else:
                                            sc[wt] -= 2 * factor
                                            sc[bt] += 4 * factor
                                        found_pref = True
                #if not same_first and not found_pref:
                #    # Only display cases where there might be a missing pref
                #    print("Token '{0}' has {1} possible terminal matches: {2}".format(txt, len(s), s))

                # Apply heuristics to each terminal that potentially matches this token
                for t in s:
                    tfirst = t.first
                    if tfirst == "ao" or tfirst == "eo":
                        # Subtract from the score of all ao and eo
                        sc[t] -= 1
                    elif tfirst == "no":
                        if t.is_singular:
                            # Add to singular nouns relative to plural ones
                            sc[t] += 1
                        elif t.is_abbrev:
                            # Punish abbreviations in favor of other more specific terminals
                            sc[t] -= 1
                    elif tfirst == "fs":
                        if t.has_variant("nf"):
                            # Reduce the weight of the 'artificial' nominative prepositions
                            # 'næstum', 'sem', 'um'
                            sc[t] -= 3 # Make other cases outweigh the Nl_nf bonus of +4 (-2 -3 = -5)
                        else:
                            # Else, give a bonus for each matched preposition
                            sc[t] += 2
                    elif tfirst == "so":
                        if t.variant(0) in "012":
                            # Consider verb arguments
                            # Normally, we give a bonus for verb arguments: the more matched, the better
                            adj = 2 * int(t.variant(0))
                            # !!! Logic should be added here to encourage zero arguments for verbs in 'miðmynd'
                            if adj == 0:
                                # Zero arguments: we might not like this
                                for m in tokens[i].t2:
                                    if m.ordfl == "so" and m.stofn not in VerbObjects.VERBS[0]:
                                        # We're using a verb with zero arguments but that form is not
                                        # explicitly listed in Verbs.conf: discourage this
                                        # print("Discouraging zero-arg use of verb '{0}' (stem '{1}')".format(txt, m.stofn))
                                        adj = -1
                                        break
                            sc[t] += adj
                        if t.is_sagnb:
                            # We like sagnb and lh, it means that more
                            # than one piece clicks into place
                            sc[t] += 4
                        elif t.is_lh:
                            # sagnb is preferred to lh, but vb (veik beyging) is discouraged
                            if t.has_variant("vb"):
                                sc[t] -= 2
                            else:
                                sc[t] += 3
                        if t.is_subj:
                            # Give a small bonus for subject matches
                            if t.has_variant("none"):
                                # ... but a punishment for subj_none
                                sc[t] -= 3
                            else:
                                sc[t] += 1
                        if t.is_nh:
                            if (i > 0) and any(pt.first == 'nhm' for pt in finals[i - 1]):
                                # Give a bonus for adjacent nhm + so_nh terminals
                                sc[t] += 2 # Prop up the verb terminal with the nh variant
                                for pt in scores[i - 1].keys():
                                    if pt.first == 'nhm':
                                        # Prop up the nhm terminal
                                        scores[i - 1][pt] += 2
                            if any(pt.first == "no" and pt.has_variant("ef") and pt.is_plural for pt in s):
                                # If this is a so_nh and an alternative no_ef_ft exists, choose this one
                                # (for example, 'hafa', 'vera', 'gera', 'fara', 'mynda', 'berja', 'borða')
                                sc[t] += 2
                    elif tfirst == "tala" or tfirst == "töl":
                        # A complete 'töl' or 'no' is better (has more info) than a rough 'tala'
                        if tfirst == "tala":
                            sc[t] -= 1
                        # Discourage possessive ('ef') meanings for numbers
                        for pt in s:
                            if (pt.first == "no" or pt.first == "töl") and pt.has_variant("ef"):
                                sc[pt] -= 1
                    elif tfirst == "sérnafn":
                        if tokens[i].t2:
                            sc[t] -= 20 # Base penalty is -20
                            for m in tokens[i].t2:
                                sc[t] -= 1 # Subtract one for each BÍN meaning available
                                if m.stofn[0].isupper():
                                    sc[t] -= 8 # Heavily discourage 'sérnafn' if an uppercase BÍN meaning is available
                    elif t.name[0] in "\"'":
                        # Give a bonus for exact or semi-exact matches
                        sc[t] += 1

        # Third pass: navigate the tree bottom-up, eliminating lower-rated
        # options (subtrees) in favor of higher rated ones

        score = self._reduce(w, scores)

        return (w, score)
# ----- Example #13 (scraped-sample separator; originally "示例#13" / "0") -----
def main():

    global fmttdcatalogueNumber  #to fix an issue with scope (line 330,333).

    ss = Preferences()
    hashtorrlistfile = ss.getwpath(
        "outpath1")  #("1seeding_ID+Hash+Filename.txt")
    directory_path = ss.getwpath("script2destdir")  #as source dir (hash-grabs)
    allfiles = [
        os.path.join(directory_path, filename)
        for filename in next(os.walk(directory_path))[2]
    ]  # gives absolute paths + names

    hashtofilenamefolder = ss.getwpath(
        "script3destdir")  #as dest dir (hash-grabs-as-filenames)
    writelistfile = codecs.open(
        ss.getwpath("outpath3"), 'wb', "utf-8"
    )  # write-out a text file with one entry per line. main output file (3propernames.txt)

    writelistcontainer = []
    currentfilenumber = 1

    for hashidfilename in allfiles:  # iterate through filenames of what.cd JSON data

        with open(hashidfilename, 'r') as stringfile:  # open them
            needFixLabeltoNewEdition = False

            jsonresponse = json.load(stringfile)
            tor = Torrent(jsonresponse)

            if tor.group.categoryName != "Music":
                continue  # do not continue altering any non-music torrents.

            releaseTypeName = ReleaseType(
                tor.group.releaseType
            ).name  # turn int. value into a string using the enum class above
            fmttdreleaseTypeName = "[" + releaseTypeName + "]"

            if tor.torrent.remastered:
                if tor.torrent.remasterTitle:
                    tor.group.name += " (" + tor.torrent.remasterTitle + ")"
                if tor.torrent.remasterYear > tor.group.year:
                    tor.group.year = tor.torrent.remasterYear
                if tor.torrent.remasterRecordLabel:
                    if tor.group.recordLabel.lower(
                    ) != tor.torrent.remasterRecordLabel.lower(
                    ):  # so not case sensitive
                        if not tor.group.recordLabel:
                            tor.group.recordLabel = tor.torrent.remasterRecordLabel
                        else:
                            # then things get complicated and we need to figure out which Label/catalog field is the best one to use, or combine them or both
                            needFixLabeltoNewEdition = True

                # if its been determined that its a remaster (new edition), process new label and catalog
                #  checking whether to combine with old, or which one to use, etc, etc.....
                if needFixLabeltoNewEdition == True:

                    score = difflib.SequenceMatcher(
                        None, tor.group.recordLabel.lower(),
                        tor.torrent.remasterRecordLabel.lower()).ratio()
                    if (score < 0.5):
                        # considered similar at 0.6 but this way is not that accurate. if they are lower than 0.5 similar, just use the new one
                        tor.group.recordLabel = tor.torrent.remasterRecordLabel
                        if tor.torrent.remasterCatalogueNumber:
                            tor.group.catalogueNumber = tor.torrent.remasterCatalogueNumber  # if tor.torrent.remasterCatalogueNumber is not blank, use it as the new tor.group.catalogueNumber

                    elif all(word in tor.torrent.remasterRecordLabel.lower() for word in tor.group.recordLabel.lower()) \
                            or \
                            all(word in tor.group.recordLabel.lower() for word in tor.torrent.remasterRecordLabel.lower()):
                        # If all the words in the old label is encompassed in the new one, use the new one.
                        # This would mean the new edition record label is most likely longer and is similar enough to use that,
                        # and preferred, since its more applicable to this specific release anyway.
                        # Even if the reverse is true, this code-block should only catch labels that differ in slight ways.(?)

                        # example 1: originallabel={Big Beat Records}  remasterlabel={Big Beat}          #elif new label in old label
                        #   result:     {Big Beat}                                                      #or
                        # example 2: originallabel={Big Beat}  remasterlabel={Big Beat Records}          #if old label in new label
                        #   result:     {Big Beat Records}                                              #then
                        tor.group.recordLabel = tor.torrent.remasterRecordLabel  # always choose new label.
                        if tor.torrent.remasterCatalogueNumber:
                            tor.group.catalogueNumber = tor.torrent.remasterCatalogueNumber  # if tor.torrent.remasterCatalogueNumber is not blank, use it as the new tor.group.catalogueNumber
                    else:
                        # This else-block is used when the above are not true, and we can't decide on which one to use, so we use both. Combined.

                        # example 1: originallabel={Island Records}  remasterlabel={Island Records / Lokal Legend}
                        #   result:     {Island Records / Lokal Legend}
                        # example 2: originallabel {Wall Recordings}  remasterlabel={Tiger Records}
                        #   result:     {Wall Recordings / Tiger Records}
                        new = ""
                        splitorig = re.sub(
                            "[(,),-]", " ", tor.group.recordLabel).split(
                            )  # remove delimiter chars that mess up stuff
                        sepchar = ["/"]
                        splitnew = re.sub(
                            "[(,),-]", " ", tor.torrent.remasterRecordLabel
                        ).split()  # turn everything into a list

                        new = " ".join([
                            "%s" % (v)
                            for v in getUniqueWords(splitorig + sepchar +
                                                    splitnew)
                        ])  # append unique words to the orig.

                        if tor.torrent.remasterCatalogueNumber:
                            if tor.group.catalogueNumber != tor.torrent.remasterCatalogueNumber:
                                if tor.group.catalogueNumber:
                                    tor.group.catalogueNumber += " / " + tor.torrent.remasterCatalogueNumber
                                else:
                                    tor.group.catalogueNumber = tor.torrent.remasterCatalogueNumber
                        tor.group.recordLabel = new

            # ntpath.basename was really slow so doing it manually.... (32 times faster)= 0.128 seconds vs 0.004 seconds
            # whats happening here is due to an exponential nested for loop, ie: 4129 results^2 = 17 million function calls of either basename or .rfind('\\')
            # there should be a better way to do this.
            hashfilesepidloc = hashidfilename.rfind("\\") + 1
            cmphashfn = hashidfilename[hashfilesepidloc:]
            iterhashfile = open(
                hashtorrlistfile,
                'rb').readlines()  # read everything into memory
            for i in iterhashfile:  # read line
                splitline = i.strip().split(
                    ' / ')  # 0 torrentID / 1 Hash / 2 torrentfilename
                if splitline[
                        1] == cmphashfn:  # if it matches, start processing
                    newEntry = TorrentEntry()  # instanciate class
                    newEntry.hash = splitline[1]  # store Hash for reference
                    newEntry.pathname = splitline[2].decode(
                        "utf-8")  # filename + extension
                    locextension = newEntry.pathname.find(
                        ".torrent")  # location of extension
                    locid = newEntry.pathname.rfind(
                        "-") + 1  # location of tor.torrent.id
                    newEntry.filename = newEntry.pathname[:
                                                          locextension]  # chop the extension off (manually)
                    newEntry.artistalbum = newEntry.filename[:locid -
                                                             1]  # JUST the name (no ID#)
                    newEntry.torrentid = newEntry.filename[
                        locid:
                        locextension]  # grab ID for future reference (tor.torrent.id on what.cd)
                    # example : S-Type - Billboard (Lido Remix) - 2014 (WEB - MP3 - 320)
                    newEntry.artist = newEntry.artistalbum[:newEntry.
                                                           artistalbum.find(
                                                               " - "
                                                           )]  # grab artist
                    tempalbum = newEntry.artistalbum[
                        newEntry.artistalbum.find(" - ") +
                        3:]  # temp value helps with string processing
                    newEntry.album = tempalbum[:tempalbum.find(
                        " - "
                    )]  # not needed since it can be pulled from [group]
                    newEntry.year = tempalbum[
                        tempalbum.find(" - ") + 3:tempalbum.find(" - ") +
                        7]  # not needed since it can be pulled from [group]

                    # ------------Recreate name------------#
                    # -------Special RULES SECTION---------#
                    newEntry.createdpropername = newEntry.artist + u" - " + tor.group.name + " "
                    if tor.group.releaseType > 1:  # dont put it for Album or Unspecified
                        if tor.group.releaseType != 5:  # do something different for EP
                            newEntry.createdpropername += fmttdreleaseTypeName + " "
                        else:  # make a rule so [EP] doesnt come up if there is " EP " already
                            if tor.group.name[-2:] != "EP":
                                newEntry.createdpropername += fmttdreleaseTypeName + " "

                    newEntry.createdpropername += "(" + str(
                        tor.group.year) + ")"

                    # written like this for easy humanreading
                    #           format = MP3, FLAC, AAC,
                    #          media = cd, web, vinyl, soundboard, dat
                    #        encoding = lossless,320,v0,256,v2,192
                    if tor.torrent.format == "FLAC":
                        newEntry.fmttdMediaEncodeFormat = "FLAC"
                        # log and logscore only applicable to flac.
                        if tor.torrent.hasLog:
                            newEntry.fmttdMediaEncodeFormat += " " + str(
                                tor.torrent.logScore
                            ) + "%"  # the % implies "log" so leave out the word log
                    if tor.torrent.format == "AAC":
                        if (any("itunes" in word.lower()
                                for word in tor.torrent.description.split())
                            ) or (any(
                                "itunes" in word.lower()
                                for word in tor.torrent.filePath.split())):
                            newEntry.fmttdMediaEncodeFormat = "iTunes "
                        newEntry.fmttdMediaEncodeFormat += "AAC"
                    if tor.torrent.format == "MP3":
                        # dont actually write mp3
                        # only write Scene or WEB if it is an mp3
                        if tor.torrent.scene:
                            newEntry.fmttdMediaEncodeFormat += "Scene"
                        elif tor.torrent.media == "WEB":
                            newEntry.fmttdMediaEncodeFormat += "WEB"
                        else:
                            newEntry.fmttdMediaEncodeFormat += tor.torrent.media
                        if "VBR" in tor.torrent.encoding:
                            newEntry.fmttdMediaEncodeFormat += " " + tor.torrent.encoding[:
                                                                                          2]
                        else:
                            newEntry.fmttdMediaEncodeFormat += " " + tor.torrent.encoding

                    newEntry.fmttdMediaEncodeFormat = "[" + newEntry.fmttdMediaEncodeFormat + "]"

                    newEntry.createdpropername += " " + newEntry.fmttdMediaEncodeFormat

                    # put catalog number in brackets
                    if tor.group.catalogueNumber:
                        fmttdcatalogueNumber = ("[" +
                                                tor.group.catalogueNumber +
                                                "]").replace(" ", "").upper()
                    elif tor.group.recordLabel:
                        fmttdcatalogueNumber = "[" + tor.group.recordLabel + "]"  # combines with next part to put recordLabel in the front if Cat# missing

                    if any(word in tor.group.recordLabel.lower() for word in
                           Sorted_Record_Labels_List):  # so not case sensitive
                        newEntry.createdpropername = fmttdcatalogueNumber + " " + newEntry.createdpropername
                    elif tor.group.recordLabel:
                        newEntry.createdpropername += " " + "{" + tor.group.recordLabel + "}"
                        # if tor.group.catalogueNumber:                                                       #This will put [CATA###] after all releases, even labels not in your list
                        #     newEntry.createdpropername += " " + fmttdcatalogueNumber              #Gets kind of cumbersome for me. (uncomment to use it anyway)

                    # these 2 lines are a quick fix, for an oversight in my naming process
                    # if these are single file .mp3's (or a single .flac) they will need a .mp3 at the end of the filename
                    if not tor.torrent.filePath:
                        newEntry.createdpropername += "." + tor.torrent.format.lower(
                        )

                    try:
                        print currentfilenumber, newEntry.createdpropername.encode(
                            'ascii', errors='ignore')
                    except:
                        print "COULD NOT PRINT UNICODE FILENAME TO CONSOLE. HASH=", tor.torrent.infoHash

                    ########-------------replace characters section----------------#########
                    newEntry.createdpropername = newEntry.createdpropername.replace(
                        "\\",
                        u"\")  # U+FF3C               FULLWIDTH REVERSE SOLIDUS
                    # these forward slashes are strange. "FullWidth" is very wide and would be too wide if theres already spaces around it.
                    newEntry.createdpropername = newEntry.createdpropername.replace(
                        " / ", u"/")  # U+FFOF  (wide)       FULLWIDTH SOLIDUS
                    # "Division" slash is too narrow and needs spaces inserted surrounding it (and is still less width than the fullwidth)
                    newEntry.createdpropername = newEntry.createdpropername.replace(
                        "/", u" ∕ ")  # U+2215  (narrow)     DIVISION SLASH
                    newEntry.createdpropername = newEntry.createdpropername.replace(
                        ":",
                        u"꞉")  # U+A789               MODIFIER LETTER COLON
                    newEntry.createdpropername = newEntry.createdpropername.replace(
                        "*", u"※")  # U+203B               REFERENCE MARK
                    newEntry.createdpropername = newEntry.createdpropername.replace(
                        "?", u"؟")  # U+061F               ARABIC QUESTION MARK
                    newEntry.createdpropername = newEntry.createdpropername.replace(
                        '"', u"ʺ"
                    )  # U+02BA               MODIFIER LETTER DOUBLE PRIME
                    newEntry.createdpropername = newEntry.createdpropername.replace(
                        "<", u"˂"
                    )  # U+02C2               MODIFIER LETTER LEFT ARROWHEAD
                    newEntry.createdpropername = newEntry.createdpropername.replace(
                        ">", u"˃"
                    )  # U+02C3               MODIFIER LETTER RIGHT ARROWHEAD
                    newEntry.createdpropername = newEntry.createdpropername.replace(
                        "|",
                        u"ǀ")  # U+01C0               LATIN LETTER DENTAL CLICK
                    #####--windows filename banned chars replacement with unicode--#########

                    ######----------HashGrabs-as-Filenames--------########
                    # File output. Move all files named as hashes to a new dir as the proper name
                    if not os.path.exists(hashtofilenamefolder +
                                          newEntry.createdpropername):
                        shutil.copy(
                            hashidfilename,
                            hashtofilenamefolder + newEntry.createdpropername)

                    currentfilenumber += 1
                    #####------------make propernames.txt (has the hash in it also) ---------########
                    # Add it to the container (since this is in a loop)
                    writelistcontainer.append(newEntry.createdpropername +
                                              " / " + tor.torrent.infoHash +
                                              "\n")
    ##File Output. The Master List file of everything.##
    # when the loop exits, Sort it, and write it to the file.
    writelistcontainer.sort()
    for eachline in writelistcontainer:
        writelistfile.write(eachline)
    writelistfile.close()
def main():
    """Apply the confirmed renames and rewrite uTorrent's resume data.

    Reads "propername / hash" lines (3propernames.txt), writes a before/after
    path list (4beforepath-afterpath.txt) for the user to review and edit,
    waits for confirmation, then renames each file on disk and patches the
    caption/path (plus the "targets" entry for single-file torrents) into a
    new resume file (NEWDAT.dat).
    """
    ss = Preferences()

    newfile = open(os.path.join(ss.get("maindir"), "NEWDAT.dat"), 'wb')
    with open(ss.getwpath("outpath3"), 'r', encoding='utf-8') as namesfh:
        namesandhashfile = namesfh.readlines()  # ("3propernames.txt")

    # intermediate-changes file: gives the user a chance to edit/change things
    # before anything is actually renamed (4beforepath-afterpath.txt)
    beforeafterpath = ss.getwpath("outpath4")

    torrentlist = bencode.decode_from_file(
        ss.get("utresumedat"))  # works  8.057s 10908143 function calls

    # These two entries interfere with the dict comprehension below, which
    # expects only torrent dicts as the root values.
    fileguarduseless = torrentlist.pop(b".fileguard", None)
    rec = torrentlist.pop(b"rec", None)  # restored after processing

    # reverse lookup: uppercase-hex infohash -> [resume.dat key, caption, path]
    reverselookup = {
        base64.b16encode(value[b"info"]):
        [key, value[b"caption"], value[b"path"]]
        for key, value in torrentlist.items()
    }

    listofbeforeafter = []
    # pull the name and hash out of each line; " / " is the separator
    for eachline in namesandhashfile:
        nameandhash = eachline.strip().split(' / ')
        theNewname = nameandhash[0]
        thehash = nameandhash[1]
        # a known hash serves as the index into the resume data
        if bytes(thehash, 'utf-8') in reverselookup:
            key = reverselookup[bytes(thehash, 'utf-8')][0]
            theOldPath = torrentlist[key][b"path"].decode('utf-8')
            theNewPath = os.path.join(os.path.dirname(theOldPath), theNewname)
            if theOldPath != theNewPath:
                listofbeforeafter.append([theOldPath, theNewPath, thehash])

    # sort, then write the file detailing path changes; the hash is duplicated
    # on both lines for error checking in case the user bungles a character
    # while editing.
    listofbeforeafter.sort()
    with open(beforeafterpath, 'w', encoding='utf-8') as beforeafterfh:
        for eachline in listofbeforeafter:
            beforeafterfh.write(eachline[0] + " / " + eachline[2] + "\n")  # oldpath + hash
            beforeafterfh.write(eachline[1] + " / " + eachline[2] + "\n")  # newpath + hash

    # pause so the user can confirm the changes shown in the before/after file
    input("Press Enter to begin Renaming files.......\\> ")

    # WRITE TORRENT RESUME.DAT
    with open(beforeafterpath, 'r', encoding='utf-8') as beforeafterfh:
        beforeafterfile = beforeafterfh.readlines()
    for i in range(0, len(beforeafterfile), 2):
        beforeandhash = beforeafterfile[i].strip().split(' / ')
        afterandhash = beforeafterfile[i + 1].strip().split(' / ')
        before = beforeandhash[0]
        beforehash = beforeandhash[1]
        after = afterandhash[0]
        afterhash = afterandhash[1]
        if beforehash == afterhash:
            thehash = beforehash
        else:
            print(
                "Error. You have inadvertently modified one of the hash files, and there is a hash mismatch between before/after entries."
            )
            print(
                "Cannot continue. Exiting. Please save your changes into a new file, locate your error, and re-run and fix it..."
            )
            print(
                "Another possibility is you were missing a / (with 1 character of whitespace on each side surrounding it) as a seperator."
            )
            # BUG FIX: the original only printed and then kept processing with
            # a stale (or unbound) hash; actually stop, as the message promises.
            return
        if bytes(thehash, 'utf-8') in reverselookup:
            key = reverselookup[bytes(thehash, 'utf-8')][0]
            torrentlist[key][b"caption"] = bytes(after[after.rfind("\\") + 1:],
                                                 'utf-8')
            try:
                # the printed index corresponds to the file's every-two-lines
                # numbering; console output drops unicode chars (windows
                # console limitation)
                print(i, before.encode('ascii', errors='ignore').decode())
                print(i + 1, after.encode('ascii', errors='ignore').decode())
                os.rename(before, after)
            except Exception:
                traceback.print_exc()  # show the error on console but keep going
            torrentlist[key][b"path"] = bytes(after, 'utf-8')
            # single-file torrents (.mp3/.flac) also carry a "targets" list
            # that controls the filename: changing path/caption alone is not
            # enough
            if after.endswith(".mp3") or after.endswith(".flac"):
                if b"targets" in torrentlist[key]:
                    torrentlist[key][b"targets"][0][1] = torrentlist[key][
                        b"caption"]

    # add the b"rec" entry we removed back in so we dont break anything (moved
    # out of the loop — it only needs to happen once).  fileguard purposefully
    # stays out.
    torrentlist[b"rec"] = rec
    newfile.write(
        bencode.bencode(torrentlist))  # works  7.699s 10906619 function calls
    newfile.close()
    print(
        "\nPlease note that the filenames shown are missing any unicode characters due to Windows Command Prompt limitations."
    )
    print("Finished writing: ", newfile.name)
def main():
    """Rename torrent data files to their proper names and emit NEWDAT.dat.

    Pipeline: read "propername / hash" lines from 3propernames.txt, write the
    reviewable 4beforepath-afterpath.txt, pause for user confirmation, then
    perform the renames and patch caption/path (and single-file "targets")
    into a freshly bencoded resume file.
    """
    ss = Preferences()

    newfile = open(os.path.join(ss.get("maindir"), "NEWDAT.dat"), 'wb')
    with open(ss.getwpath("outpath3"), 'r', encoding='utf-8') as fh:
        namesandhashfile = fh.readlines()  # ("3propernames.txt")

    # intermediate changes land here first so the user can edit them before
    # any actual renaming happens (4beforepath-afterpath.txt)
    beforeafterpath = ss.getwpath("outpath4")

    torrentlist = bencode.decode_from_file(ss.get("utresumedat"))  # works  8.057s 10908143 function calls

    # These two entries interfere with the dict comprehension below, which
    # expects only torrent dicts as root values.
    fileguarduseless = torrentlist.pop(b".fileguard", None)
    rec = torrentlist.pop(b"rec", None)  # re-added once processing is done

    # reverse lookup dict: hex infohash -> [resume key, caption, path]
    reverselookup = {base64.b16encode(value[b"info"]): [key, value[b"caption"], value[b"path"]] for key, value in torrentlist.items()}

    listofbeforeafter = []
    # extract name and hash per line; " / " is the separator
    for eachline in namesandhashfile:
        nameandhash = eachline.strip().split(' / ')
        theNewname = nameandhash[0]
        thehash = nameandhash[1]
        # a recognized hash doubles as the index into the resume data
        if bytes(thehash, 'utf-8') in reverselookup:
            key = reverselookup[bytes(thehash, 'utf-8')][0]
            theOldPath = torrentlist[key][b"path"].decode('utf-8')
            theNewPath = os.path.join(os.path.dirname(theOldPath), theNewname)
            if theOldPath != theNewPath:
                listofbeforeafter.append([theOldPath, theNewPath, thehash])

    # sort, then write the before/after detail file; the hash appears on both
    # lines as an integrity check against accidental edits
    listofbeforeafter.sort()
    with open(beforeafterpath, 'w', encoding='utf-8') as fh:
        for eachline in listofbeforeafter:
            fh.write(eachline[0] + " / " + eachline[2] + "\n")  # oldpath + hash
            fh.write(eachline[1] + " / " + eachline[2] + "\n")  # newpath + hash

    # pause here: the user confirms the changes shown in the before/after file
    input("Press Enter to begin Renaming files.......\\> ")

    # WRITE TORRENT RESUME.DAT
    with open(beforeafterpath, 'r', encoding='utf-8') as fh:
        beforeafterfile = fh.readlines()
    for i in range(0, len(beforeafterfile), 2):
        beforeandhash = beforeafterfile[i].strip().split(' / ')
        afterandhash = beforeafterfile[i + 1].strip().split(' / ')
        before = beforeandhash[0]
        beforehash = beforeandhash[1]
        after = afterandhash[0]
        afterhash = afterandhash[1]
        if beforehash == afterhash:
            thehash = beforehash
        else:
            print("Error. You have inadvertently modified one of the hash files, and there is a hash mismatch between before/after entries.")
            print("Cannot continue. Exiting. Please save your changes into a new file, locate your error, and re-run and fix it...")
            print("Another possibility is you were missing a / (with 1 character of whitespace on each side surrounding it) as a seperator.")
            # BUG FIX: the original fell through here and kept going with a
            # stale (or unbound) hash; stop, as the message promises.
            return
        if bytes(thehash, 'utf-8') in reverselookup:
            key = reverselookup[bytes(thehash, 'utf-8')][0]
            torrentlist[key][b"caption"] = bytes(after[after.rfind("\\") + 1:], 'utf-8')
            try:
                # printed indices mirror the every-two-lines file numbering;
                # unicode chars are dropped (windows console limitation)
                print(i, before.encode('ascii', errors='ignore').decode())
                print(i + 1, after.encode('ascii', errors='ignore').decode())
                os.rename(before, after)
            except Exception:
                traceback.print_exc()  # log the failure but keep going
            torrentlist[key][b"path"] = bytes(after, 'utf-8')
            # single-file-mode torrents have a "targets" list that controls the
            # filename — changing path/caption alone is not enough
            if after.endswith(".mp3") or after.endswith(".flac"):
                if b"targets" in torrentlist[key]:
                    torrentlist[key][b"targets"][0][1] = torrentlist[key][b"caption"]

    # restore the b"rec" entry we removed so nothing breaks (hoisted out of the
    # loop — the original redundantly reassigned it every iteration).
    # fileguard purposefully stays out.
    torrentlist[b"rec"] = rec
    newfile.write(bencode.bencode(torrentlist))  # works  7.699s 10906619 function calls
    newfile.close()
    print("\nPlease note that the filenames shown are missing any unicode characters due to Windows Command Prompt limitations.")
    print("Finished writing: ", newfile.name)
def main():
    """Rename what.cd hash-named JSON grabs into human-readable torrent names.

    For every JSON metadata file in ``script2destdir`` (files named after a
    torrent's info-hash) this:
      1. rebuilds a "proper" release name from the JSON data (artist, title,
         remaster/edition info, release type, year, format/media/encoding,
         catalogue number or record label),
      2. substitutes Windows-forbidden filename characters with visually
         similar Unicode characters,
      3. copies the hash-named file into ``script3destdir`` under the new
         name, and
      4. writes a sorted master list of "<propername> / <infohash>" lines
         to outpath3 (3propernames.txt).
    """

    global fmttdcatalogueNumber     # scope workaround; NOTE(review): a stale value can carry over between iterations when neither catalogueNumber nor recordLabel is set — confirm intended.

    ss = Preferences()
    hashtorrlistfile = ss.getwpath("outpath1")
    directory_path = ss.getwpath("script2destdir")   # as source dir (hash-grabs)
    allfiles = [os.path.join(directory_path, filename) for filename in next(os.walk(directory_path))[2]]  # gives absolute paths + names

    hashtofilenamefolder = ss.getwpath("script3destdir") # as dest dir (hash-grabs-as-filenames)
    writelistfile = codecs.open(ss.getwpath("outpath3"), 'wb', "utf-8")  # write-out a text file with one entry per line. main output file (3propernames.txt)

    # Read the "torrentID / Hash / torrentfilename" list ONCE, up front.
    # It is loop-invariant: previously it was re-opened and fully re-read for
    # every single JSON file (and the handle was never closed).
    with open(hashtorrlistfile, 'r', encoding='utf-8') as hashlistfh:
        iterhashfile = hashlistfh.readlines()  # read everything into memory

    writelistcontainer = []
    currentfilenumber = 1

    for hashidfilename in allfiles:  # iterate through filenames of what.cd JSON data

        with open(hashidfilename, 'r') as stringfile:  # open them
            needFixLabeltoNewEdition = False

            jsonresponse = json.load(stringfile)
            tor = Torrent(jsonresponse)

            if tor.group.categoryName != "Music":
                continue  # do not continue altering any non-music torrents.

            releaseTypeName = ReleaseType(tor.group.releaseType).name  # turn int. value into a string using the enum class above
            fmttdreleaseTypeName = "[" + releaseTypeName + "]"

            # Remasters (new editions) fold their edition-specific title/year/label
            # into the group-level fields before the name is built.
            if tor.torrent.remastered:
                if tor.torrent.remasterTitle:
                    tor.group.name += " (" + tor.torrent.remasterTitle + ")"
                if tor.torrent.remasterYear > tor.group.year:
                    tor.group.year = tor.torrent.remasterYear
                if tor.torrent.remasterRecordLabel:
                    if tor.group.recordLabel.lower() != tor.torrent.remasterRecordLabel.lower():  # so not case sensitive
                        if not tor.group.recordLabel:
                            tor.group.recordLabel = tor.torrent.remasterRecordLabel
                        else:
                            # then things get complicated and we need to figure out which Label/catalog field is the best one to use, or combine them or both
                            needFixLabeltoNewEdition = True

                # if its been determined that its a remaster (new edition), process new label and catalog
                #  checking whether to combine with old, or which one to use, etc, etc.....
                if needFixLabeltoNewEdition:

                    score = difflib.SequenceMatcher(None, tor.group.recordLabel.lower(), tor.torrent.remasterRecordLabel.lower()).ratio()
                    if (score < 0.5):
                        # considered similar at 0.6 but this way is not that accurate. if they are lower than 0.5 similar, just use the new one
                        tor.group.recordLabel = tor.torrent.remasterRecordLabel
                        if tor.torrent.remasterCatalogueNumber:
                            tor.group.catalogueNumber = tor.torrent.remasterCatalogueNumber  # if tor.torrent.remasterCatalogueNumber is not blank, use it as the new tor.group.catalogueNumber

                    # BUGFIX: the generators must iterate WORDS, not characters.
                    # Without .split(), "for word in <str>.lower()" yields single
                    # characters, so the containment test was almost always true.
                    elif all(word in tor.torrent.remasterRecordLabel.lower() for word in tor.group.recordLabel.lower().split()) \
                            or \
                            all(word in tor.group.recordLabel.lower() for word in tor.torrent.remasterRecordLabel.lower().split()):
                        # If all the words in the old label is encompassed in the new one, use the new one.
                        # This would mean the new edition record label is most likely longer and is similar enough to use that,
                        # and preferred, since its more applicable to this specific release anyway.
                        # Even if the reverse is true, this code-block should only catch labels that differ in slight ways.(?)

                        # example 1: originallabel={Big Beat Records}  remasterlabel={Big Beat}          #elif new label in old label
                        #   result:     {Big Beat}                                                      #or
                        # example 2: originallabel={Big Beat}  remasterlabel={Big Beat Records}          #if old label in new label
                        #   result:     {Big Beat Records}                                              #then
                        tor.group.recordLabel = tor.torrent.remasterRecordLabel  # always choose new label.
                        if tor.torrent.remasterCatalogueNumber:
                            tor.group.catalogueNumber = tor.torrent.remasterCatalogueNumber  # if tor.torrent.remasterCatalogueNumber is not blank, use it as the new tor.group.catalogueNumber
                    else:
                        # This else-block is used when the above are not true, and we can't decide on which one to use, so we use both. Combined.

                        # example 1: originallabel={Island Records}  remasterlabel={Island Records / Lokal Legend}
                        #   result:     {Island Records / Lokal Legend}
                        # example 2: originallabel {Wall Recordings}  remasterlabel={Tiger Records}
                        #   result:     {Wall Recordings / Tiger Records}
                        new = ""
                        splitorig = re.sub("[(,),-]", " ",
                                           tor.group.recordLabel).split()  # remove delimiter chars that mess up stuff
                        sepchar = ["/"]
                        splitnew = re.sub("[(,),-]", " ", tor.torrent.remasterRecordLabel).split()  # turn everything into a list

                        new = " ".join(["%s" % (v) for v in getUniqueWords(
                            splitorig + sepchar + splitnew)])  # append unique words to the orig.

                        if tor.torrent.remasterCatalogueNumber:
                            if tor.group.catalogueNumber != tor.torrent.remasterCatalogueNumber:
                                if tor.group.catalogueNumber:
                                    tor.group.catalogueNumber += " / " + tor.torrent.remasterCatalogueNumber
                                else:
                                    tor.group.catalogueNumber = tor.torrent.remasterCatalogueNumber
                        tor.group.recordLabel = new

            # ntpath.basename was really slow so doing it manually.... (32 times faster)= 0.128 seconds vs 0.004 seconds
            # (the hash list itself is now read once before the loop instead of once per file)
            hashfilesepidloc = hashidfilename.rfind("\\") + 1
            cmphashfn = hashidfilename[hashfilesepidloc:]
            for i in iterhashfile:  # read line
                splitline = i.strip().split(' / ')      # 0 torrentID / 1 Hash / 2 torrentfilename
                if splitline[1] == cmphashfn:  # if it matches, start processing
                    newEntry = TorrentEntry()  # instanciate class
                    newEntry.hash = splitline[1]  # store Hash for reference
                    newEntry.pathname = splitline[2]  # filename + extension
                    locextension = newEntry.pathname.find(".torrent")  # location of extension
                    locid = newEntry.pathname.rfind("-") + 1  # location of tor.torrent.id
                    newEntry.filename = newEntry.pathname[:locextension]  # chop the extension off (manually)
                    newEntry.artistalbum = newEntry.filename[:locid - 1]  # JUST the name (no ID#)
                    newEntry.torrentid = newEntry.filename[locid:locextension]  # grab ID for future reference (tor.torrent.id on what.cd)
                    # example : S-Type - Billboard (Lido Remix) - 2014 (WEB - MP3 - 320)
                    newEntry.artist = newEntry.artistalbum[:newEntry.artistalbum.find(" - ")]  # grab artist
                    tempalbum = newEntry.artistalbum[newEntry.artistalbum.find(" - ") + 3:]  # temp value helps with string processing
                    newEntry.album = tempalbum[:tempalbum.find(" - ")]  # not needed since it can be pulled from [group]
                    newEntry.year = tempalbum[tempalbum.find(" - ") + 3:tempalbum.find(" - ") + 7]  # not needed since it can be pulled from [group]

                    # ------------Recreate name------------#
                    # -------Special RULES SECTION---------#
                    newEntry.createdpropername = newEntry.artist + " - " + tor.group.name + " "
                    if tor.group.releaseType > 1:  # dont put it for Album or Unspecified
                        if tor.group.releaseType != 5:  # do something different for EP
                            newEntry.createdpropername += fmttdreleaseTypeName + " "
                        else:  # make a rule so [EP] doesnt come up if there is " EP " already
                            if tor.group.name[-2:] != "EP":
                                newEntry.createdpropername += fmttdreleaseTypeName + " "

                    newEntry.createdpropername += "(" + str(tor.group.year) + ")"

                    # written like this for easy humanreading
                    #           format = MP3, FLAC, AAC,
                    #          media = cd, web, vinyl, soundboard, dat
                    #        encoding = lossless,320,v0,256,v2,192
                    if tor.torrent.format == "FLAC":
                        newEntry.fmttdMediaEncodeFormat = "FLAC"
                        # log and logscore only applicable to flac.
                        if tor.torrent.hasLog:
                            newEntry.fmttdMediaEncodeFormat += " " + str(
                                tor.torrent.logScore) + "%"  # the % implies "log" so leave out the word log
                    if tor.torrent.format == "AAC":
                        if (any("itunes" in word.lower() for word in tor.torrent.description.split())) or (
                        any("itunes" in word.lower() for word in tor.torrent.filePath.split())):
                            newEntry.fmttdMediaEncodeFormat = "iTunes "
                        newEntry.fmttdMediaEncodeFormat += "AAC"
                    if tor.torrent.format == "MP3":
                        # dont actually write mp3
                        # only write Scene or WEB if it is an mp3
                        if tor.torrent.scene:
                            newEntry.fmttdMediaEncodeFormat += "Scene"
                        elif tor.torrent.media == "WEB":
                            newEntry.fmttdMediaEncodeFormat += "WEB"
                        else:
                            newEntry.fmttdMediaEncodeFormat += tor.torrent.media
                        if "VBR" in tor.torrent.encoding:
                            newEntry.fmttdMediaEncodeFormat += " " + tor.torrent.encoding[:2]
                        else:
                            newEntry.fmttdMediaEncodeFormat += " " + tor.torrent.encoding

                    newEntry.fmttdMediaEncodeFormat = "[" + newEntry.fmttdMediaEncodeFormat + "]"

                    newEntry.createdpropername += " " + newEntry.fmttdMediaEncodeFormat

                    # put catalog number in brackets
                    if tor.group.catalogueNumber:
                        fmttdcatalogueNumber = ("[" + tor.group.catalogueNumber + "]").replace(" ", "").upper()
                    elif tor.group.recordLabel:
                        fmttdcatalogueNumber = "[" + tor.group.recordLabel + "]"  # combines with next part to put recordLabel in the front if Cat# missing

                    if any(word in tor.group.recordLabel.lower() for word in Sorted_Record_Labels_List):  # so not case sensitive
                        newEntry.createdpropername = fmttdcatalogueNumber + " " + newEntry.createdpropername
                    elif tor.group.recordLabel:
                        newEntry.createdpropername += " " + "{" + tor.group.recordLabel + "}"
                        # if tor.group.catalogueNumber:                                                       #This will put [CATA###] after all releases, even labels not in your list
                        #     newEntry.createdpropername += " " + fmttdcatalogueNumber              #Gets kind of cumbersome for me. (uncomment to use it anyway)

                    # these 2 lines are a quick fix, for an oversight in my naming process
                    # if these are single file .mp3's (or a single .flac) they will need a .mp3 at the end of the filename
                    if not tor.torrent.filePath:
                        newEntry.createdpropername += "." + tor.torrent.format.lower()

                    try:
                        print(currentfilenumber, newEntry.createdpropername.encode('ascii', errors='ignore').decode())
                    except Exception:  # console may still choke on some encodings; keep going
                        print("COULD NOT PRINT UNICODE FILENAME TO CONSOLE. HASH=", tor.torrent.infoHash)

                    ########-------------replace characters section----------------#########
                    newEntry.createdpropername = newEntry.createdpropername.replace("\\","＼")  # U+FF3C               FULLWIDTH REVERSE SOLIDUS
                    # these forward slashes are strange. "FullWidth" is very wide and would be too wide if theres already spaces around it.
                    newEntry.createdpropername = newEntry.createdpropername.replace(" / ","／")  # U+FF0F  (wide)       FULLWIDTH SOLIDUS
                    # "Division" slash is too narrow and needs spaces inserted surrounding it (and is still less width than the fullwidth)
                    newEntry.createdpropername = newEntry.createdpropername.replace("/"," ∕ ")  # U+2215  (narrow)     DIVISION SLASH
                    newEntry.createdpropername = newEntry.createdpropername.replace(":","꞉")  # U+A789               MODIFIER LETTER COLON
                    newEntry.createdpropername = newEntry.createdpropername.replace("*","※")  # U+203B               REFERENCE MARK
                    newEntry.createdpropername = newEntry.createdpropername.replace("?","؟")  # U+061F               ARABIC QUESTION MARK
                    newEntry.createdpropername = newEntry.createdpropername.replace('"',"ʺ")  # U+02BA               MODIFIER LETTER DOUBLE PRIME
                    newEntry.createdpropername = newEntry.createdpropername.replace("<","˂")  # U+02C2               MODIFIER LETTER LEFT ARROWHEAD
                    newEntry.createdpropername = newEntry.createdpropername.replace(">","˃")  # U+02C3               MODIFIER LETTER RIGHT ARROWHEAD
                    newEntry.createdpropername = newEntry.createdpropername.replace("|","ǀ")  # U+01C0               LATIN LETTER DENTAL CLICK
                    #####--windows filename banned chars replacement with unicode--#########

                    ######----------HashGrabs-as-Filenames--------########
                    # File output. Move all files named as hashes to a new dir as the proper name
                    if not os.path.exists(hashtofilenamefolder + newEntry.createdpropername):
                        shutil.copy(hashidfilename, hashtofilenamefolder + newEntry.createdpropername)

                    currentfilenumber += 1
                    #####------------make propernames.txt (has the hash in it also) ---------########
                    # Add it to the container (since this is in a loop)
                    writelistcontainer.append(newEntry.createdpropername + " / " + tor.torrent.infoHash + "\n")
    ##File Output. The Master List file of everything.##
    # when the loop exits, Sort it, and write it to the file.
    writelistcontainer.sort()
    for eachline in writelistcontainer:
        writelistfile.write(eachline)
    writelistfile.close()
def main():
    """Index a directory of .torrent files into a sorted master list.

    For every .torrent file in ``script1sourcedir`` this computes the
    info-hash (SHA-1 of the bencoded info-dict), strips the trailing
    "-<torrentID>.torrent" from the filename, substitutes Windows-forbidden
    characters with visually similar Unicode characters, and writes one
    sorted "<torrentname> / <infohash>" line per file to outpath3.

    NOTE(review): this block is Python 2 code (print statements, u'' prefixes).
    """
    ss = Preferences()
    script1sourcedir = ss.getwpath(u"script1sourcedir")+u''            # e.g. ("seeding\"); the u'' concat forces a unicode path so os.walk copes with non-ASCII names (Python 2)
    files = [os.path.join(script1sourcedir,filename) for filename in next(os.walk(script1sourcedir))[2]]        #gives absolute paths + names

    currentfile = 0   # running count, purely for console progress output

    container = []    #set up an empty container for desired data to get put into for later
    for eachfile in files:

        metainfo = bencode.decode_from_file(eachfile)
        # need to manually SHA1 hash the torrent file's info-dict to get the info-hash
        infodict = metainfo['info']
        info_hash = hashlib.sha1(bencode.bencode(infodict)).hexdigest().upper()

        internalname = infodict['name']
        torrentfilename = eachfile[eachfile.rfind("\\")+1:]        # manual basename (Windows path separator assumed)
        locextension = torrentfilename.find(".torrent")           #location of extension (char position)
        locid = torrentfilename.rfind("-")+1                      #location of torrentID (char position)
        torrentid = torrentfilename[locid:locextension]           #grab torrentID 
        
        torrentfilename = torrentfilename[:locid-1]               # drop the "-<torrentID>.torrent" tail

        #####-------------replace banned characters with unicode section-----------------######
        ###
        # Forward slashes are strange. "FullWidth" is very wide and would be too wide if theres already spaces around it.
        torrentfilename = torrentfilename.replace(" / ",u"／")  # U+FF0F  (wide)       FULLWIDTH SOLIDUS
        # "Division" slash is too narrow and needs spaces inserted surrounding it (and is still less width than the fullwidth)
        torrentfilename = torrentfilename.replace("/",u" ∕ ")  # U+2215  (narrow)     DIVISION SLASH
        # Backslash (requires two slashes in python)
        torrentfilename = torrentfilename.replace("\\",u"＼")  # U+FF3C               FULLWIDTH REVERSE SOLIDUS
        # Colon
        torrentfilename = torrentfilename.replace(":",u"꞉")  # U+A789               MODIFIER LETTER COLON
        # asterisk
        torrentfilename = torrentfilename.replace("*",u"※")  # U+203B               REFERENCE MARK
        # question mark (replacement is backwards, sorry)
        torrentfilename = torrentfilename.replace("?",u"؟")  # U+061F               ARABIC QUESTION MARK
        # Double-quote
        torrentfilename = torrentfilename.replace('"',u"ʺ")  # U+02BA               MODIFIER LETTER DOUBLE PRIME
        # Left angle bracket
        torrentfilename = torrentfilename.replace("<",u"˂")  # U+02C2               MODIFIER LETTER LEFT ARROWHEAD
        # right angle bracket
        torrentfilename = torrentfilename.replace(">",u"˃")  # U+02C3               MODIFIER LETTER RIGHT ARROWHEAD
        # Pipe
        torrentfilename = torrentfilename.replace("|",u"ǀ")  # U+01C0               LATIN LETTER DENTAL CLICK
        ###
        #####----------windows filename banned chars replacement with unicode-----------######

        container.append([torrentfilename, internalname, info_hash, torrentid])
        currentfile += 1 
        print currentfile, torrentfilename.encode('ascii', errors='ignore')

    print "\nReminder: Console output is ascii only, Cannot Print Unicode. (chars omitted)"
    ##File Output. The Master List file of everything.##                    
    # when the loop exits, Sort it, and write it to the file.
    container.sort()
    writelistfile = codecs.open(ss.getwpath("outpath3"), 'wb', "utf-8")  # write-out a text file with one entry per line. main output file (3propernames.txt)
    for eachline in container:
        writelistfile.write(eachline[0] + " / " + eachline[2] + "\n")   #torrentname  / infohash
    writelistfile.close()
    print "Completed. Unicode File Written to: ", os.path.basename(ss.getwpath("outpath3"))
# ----- 示例 #18 (example-separator artifact from the original scrape) -----
    def _calc_terminal_scores(self, w):
        """Calculate the score for each possible terminal/token match.

        Args:
            w: a node spanning token indices [w.start, w.end) — presumably a
               parse-forest (sub)tree node; confirm against the caller.

        Returns:
            A dict mapping each token index i to a dict {terminal: int score},
            where a higher score marks a preferred terminal interpretation of
            that token. Scores start at 0 and are adjusted by the preference
            tables (Reynir.conf / Verbs.conf) and the heuristics below.
        """

        # First pass: for each token, find the possible terminals that
        # can correspond to that token
        finals = defaultdict(set)
        tokens = dict()
        self._find_options(w, finals, tokens)

        # Second pass: find a (partial) ordering by scoring the terminal alternatives for each token
        scores = dict()
        noun_prefs = NounPreferences.DICT

        # Loop through the indices of the tokens spanned by this tree
        for i in range(w.start, w.end):

            s = finals[i]
            # Initially, each alternative has a score of 0
            scores[i] = {terminal: 0 for terminal in s}

            if len(s) <= 1:
                # No ambiguity to resolve here
                continue

            # More than one terminal in the option set for the token at index i
            # Calculate the relative scores
            # Find out whether the first part of all the terminals are the same
            same_first = len(set(terminal.first for terminal in s)) == 1
            txt = tokens[i].lower
            # Get the last part of a composite word (e.g. 'jaðar-áhrifin' -> 'áhrifin')
            txt_last = txt.rsplit('-', maxsplit=1)[-1]
            # No need to check preferences if the first parts of all possible terminals are equal
            # Look up the preference ordering from Reynir.conf, if any
            prefs = None if same_first else Preferences.get(txt_last)
            sc = scores[i]
            if prefs:
                # Apply the configured worse/better pairs: demote terminals in the
                # 'worse' set and promote those in the 'better' set, scaled by factor.
                # Only the single strongest adjustment per terminal is kept (min/max).
                adj_worse = defaultdict(int)
                adj_better = defaultdict(int)
                for worse, better, factor in prefs:
                    for wt in s:
                        if wt.first in worse:
                            for bt in s:
                                if wt is not bt and bt.first in better:
                                    if bt.name[0] in "\"'":
                                        # Literal terminal: be even more aggressive in promoting it
                                        adj_w = -2 * factor
                                        adj_b = +6 * factor
                                    else:
                                        adj_w = -2 * factor
                                        adj_b = +4 * factor
                                    adj_worse[wt] = min(adj_worse[wt], adj_w)
                                    adj_better[bt] = max(adj_better[bt], adj_b)
                for wt, adj in adj_worse.items():
                    sc[wt] += adj
                for bt, adj in adj_better.items():
                    sc[bt] += adj

            # Apply heuristics to each terminal that potentially matches this token
            # (category-specific adjustments keyed on the terminal's first name part)
            for t in s:
                tfirst = t.first
                if tfirst == "ao" or tfirst == "eo":
                    # Subtract from the score of all ao and eo
                    sc[t] -= 1
                elif tfirst == "no":
                    if t.is_singular:
                        # Add to singular nouns relative to plural ones
                        sc[t] += 1
                    elif t.is_abbrev:
                        # Punish abbreviations in favor of other more specific terminals
                        sc[t] -= 1
                    if tokens[i].is_upper and tokens[i].is_word and tokens[
                            i].t2:
                        # Punish connection of normal noun terminal to
                        # an uppercase word that can be a person or entity name
                        if any(m.fl in {"ism", "föð", "móð", "örn", "fyr"}
                               for m in tokens[i].t2):
                            # logging.info("Punishing connection of {0} with 'no' terminal".format(tokens[i].t1))
                            sc[t] -= 5
                    # Noun priorities, i.e. between different genders
                    # of the same word form
                    # (for example "ára" which can refer to three stems with different genders)
                    if txt_last in noun_prefs:
                        np = noun_prefs[txt_last].get(t.gender, 0)
                        sc[t] += np
                elif tfirst == "fs":
                    if t.has_variant("nf"):
                        # Reduce the weight of the 'artificial' nominative prepositions
                        # 'næstum', 'sem', 'um'
                        sc[t] -= 8  # Make other cases outweigh the Nl_nf bonus of +4 (-2 -3 = -5)
                    elif txt == "við" and t.has_variant("þgf"):
                        sc[t] += 1  # Smaller bonus for við + þgf (is rarer than við + þf)
                    elif txt == "sem" and t.has_variant("þf"):
                        sc[t] -= 4
                    elif txt == "á" and t.has_variant("þgf"):
                        sc[t] += 4  # Larger bonus for á + þgf to resolve conflict with verb 'eiga'
                    else:
                        # Else, give a bonus for each matched preposition
                        sc[t] += 2
                elif tfirst == "so":
                    if t.num_variants > 0 and t.variant(0) in "012":
                        # Consider verb arguments
                        # Normally, we give a bonus for verb arguments: the more matched, the better
                        numcases = int(t.variant(0))
                        adj = 2 * numcases
                        # !!! Logic should be added here to encourage zero arguments for verbs in 'miðmynd'
                        if numcases == 0:
                            # Zero arguments: we might not like this
                            vo0 = VerbObjects.VERBS[0]
                            if all(
                                (m.stofn not in vo0) and (m.ordmynd not in vo0)
                                    and ("MM" not in m.beyging)
                                    for m in tokens[i].t2 if m.ordfl == "so"):
                                # No meaning where the verb has zero arguments
                                # print("Subtracting 5 points for 0-arg verb {0}".format(tokens[i].t1))
                                adj = -5
                        # Apply score adjustments for verbs with particular object cases,
                        # as specified by $score(n) pragmas in Verbs.conf
                        # In the (rare) cases where there are conflicting scores,
                        # apply the most positive adjustment
                        adjmax = 0
                        for m in tokens[i].t2:
                            if m.ordfl == "so":
                                key = m.stofn + t.verb_cases
                                score = VerbObjects.SCORES.get(key)
                                if score is not None:
                                    adjmax = score
                                    break
                        sc[t] += adj + adjmax
                    if t.is_sagnb:
                        # We like sagnb and lh, it means that more
                        # than one piece clicks into place
                        sc[t] += 6
                    elif t.is_lh:
                        # sagnb is preferred to lh, but vb (veik beyging) is discouraged
                        if t.has_variant("vb"):
                            sc[t] -= 2
                        else:
                            sc[t] += 3
                    elif t.is_lh_nt:
                        sc[t] += 12  # Encourage LHNT rather than LO
                    elif t.is_mm:
                        # Encourage mm forms. The encouragement should be better than
                        # the score for matching a single case, so we pick so_0_mm
                        # rather than so_1_þgf, for instance.
                        sc[t] += 3
                    elif t.is_vh:
                        # Encourage vh forms
                        sc[t] += 2
                    if t.is_subj:
                        # Give a small bonus for subject matches
                        if t.has_variant("none"):
                            # ... but a punishment for subj_none
                            sc[t] -= 3
                        else:
                            sc[t] += 1
                    if t.is_nh:
                        if (i > 0) and any(pt.first == 'nhm'
                                           for pt in finals[i - 1]):
                            # Give a bonus for adjacent nhm + so_nh terminals
                            sc[t] += 4  # Prop up the verb terminal with the nh variant
                            for pt in scores[i - 1].keys():
                                if pt.first == 'nhm':
                                    # Prop up the nhm terminal
                                    scores[i - 1][pt] += 2
                                    break
                        if any(pt.first == "no" and pt.has_variant("ef")
                               and pt.is_plural for pt in s):
                            # If this is a so_nh and an alternative no_ef_ft exists, choose this one
                            # (for example, 'hafa', 'vera', 'gera', 'fara', 'mynda', 'berja', 'borða')
                            sc[t] += 4
                    if (i > 0) and tokens[i].is_upper:
                        # The token is uppercase and not at the start of a sentence:
                        # discourage it from being a verb
                        sc[t] -= 4
                elif tfirst == "tala":
                    if t.has_variant("ef"):
                        # Try to avoid interpreting plain numbers as possessives
                        sc[t] -= 4
                elif tfirst == "person":
                    if t.has_variant("nf"):
                        # Prefer person names in the nominative case
                        sc[t] += 2
                elif tfirst == "sérnafn":
                    if not tokens[i].t2:
                        # If there are no BÍN meanings, we had no choice but to use sérnafn,
                        # so alleviate some of the penalty given by the grammar
                        sc[t] += 4
                    else:
                        # BÍN meanings are available: discourage this
                        # print(f"Discouraging sérnafn {txt}, BÍN meanings are {tokens[i].t2}")
                        sc[t] -= 10
                        if i == w.start:
                            # First token in sentence, and we have BÍN meanings:
                            # further discourage this
                            sc[t] -= 6
                elif tfirst == "fyrirtæki":
                    # We encourage company names to be interpreted as such,
                    # so we give company abbreviations ('hf.', 'Corp.', 'Limited')
                    # a high priority
                    sc[t] += 24
                elif tfirst == "st" or (tfirst == "sem"
                                        and t.colon_cat == "st"):
                    if txt == "sem":
                        # Discourage "sem" as a pure conjunction (samtenging)
                        # (it does not get a penalty when occurring as
                        # a connective conjunction, 'stt')
                        sc[t] -= 6
                elif tfirst == "abfn":
                    # If we have number and gender information with the reflexive
                    # pronoun, that's good: encourage it
                    sc[t] += 6 if t.num_variants > 1 else 2
                elif tfirst == "gr":
                    # Encourage separate definite article rather than pronoun
                    sc[t] += 2
                elif t.name[0] in "\"'":
                    # Give a bonus for exact or semi-exact matches
                    sc[t] += 1

        return scores
def main():
    ss = Preferences()

    oldfile = open(ss.get("utresumedat"), "rb").read()
    newfile = open(os.path.join(ss.get("maindir"), u"NEWDAT.dat"), "wb")
    namesandhashfile = open(ss.getwpath("outpath3"), "rb").readlines()

    beforeafterpath = ss.getwpath(
        "outpath4"
    )  # this holds the intermediate changes to happen before actually renaming so you have a chance to edit/change it. (4beforepath-afterpath.txt)

    torrentlist = bencode.bdecode(oldfile)

    # These two things interfere with the processing on the next line
    fileguarduseless = torrentlist.pop(".fileguard", None)
    rec = torrentlist.pop("rec", None)  # Remove this.
    # (dict. comprehension expects only dicts as the root keys)
    # create a reverse lookup dict with "Dict comprehension". nice and simple eh? ;-)
    reverselookup = {
        base64.b16encode(value["info"]): [key, value["caption"], value["path"]]
        for key, value in torrentlist.iteritems()
    }

    listofbeforeafter = []
    # to modify paths in reverse lookup dict, start by getting the names and hash out of the namesandhashfile
    for eachline in namesandhashfile:
        nameandhash = eachline.strip().split(
            " / "
        )  # strip out the \n with strip() and split on the " / " i put there as a seperator.
        theNewname = nameandhash[0]
        thehash = nameandhash[1]
        # searches the dict's keys for a Hash, if exists. and if so, can be used as the [indexid]
        if thehash in reverselookup:
            key = reverselookup[thehash][0]
            theOldPath = torrentlist[key]["path"]
            theNewPath = os.path.join(os.path.dirname(theOldPath), theNewname)
            if theOldPath != theNewPath:
                listofbeforeafter.append(
                    [theOldPath, theNewPath, thehash]
                )  # make a list of a list (stringtoOutputtoFile=[0], hash=[1])

    # sort, then write file detailing changes to path (before / after)
    listofbeforeafter.sort()
    beforeafterfile = open(beforeafterpath, "wb")
    for eachline in listofbeforeafter:
        try:
            beforeafterfile.write(
                eachline[0] + " / " + eachline[2] + "\n"
            )  # write oldpath + hash on 1st line    /The hash is duplicated for error checking in case the user accidentally bungles a character while editing...
            beforeafterfile.write(eachline[1] + " / " + eachline[2] + "\n")  # write newpath + hash on 2nd line   /
        except:
            print "Error writing the before+after file, probably a encoding/unicode error: \n", eachline[
                0
            ], "\n", eachline[1]
            print "This was a fatal error and program could not continue."
            return
    beforeafterfile.close()

    # At this point the script pauses, and asks the user to confirm changes shown in the beforepath-afterpath.txt file
    raw_input(
        "Press Enter to begin Renaming files.......\\> "
    )  # wait for the user to press Enter before continuing with anything.

    # WRITE TORRENT RESUME.DAT
    beforeafterfile = open(beforeafterpath, "rb").readlines()
    for i in xrange(0, len(beforeafterfile), 2):
        beforeandhash = beforeafterfile[i].strip().split(" / ")
        afterandhash = beforeafterfile[i + 1].strip().split(" / ")
        before = beforeandhash[0].decode("utf-8")
        beforehash = beforeandhash[1]
        after = afterandhash[0].decode("utf-8")
        afterhash = afterandhash[1]
        if beforehash == afterhash:
            thehash = beforehash
        else:
            print "Error. You have inadvertently modified one of the hash files, and there is a hash mismatch between before/after entries."
            print "Cannot continue. Exiting. Please save your changes into a new file, locate your error, and re-run and fix it..."
            print "Another possibility is you were missing a / (with 1 character of whitespace on each side surrounding it) as a seperator."
        # searches the dict's keys for a Hash, if exists. and if so, can be used as the [indexid]
        if thehash in reverselookup:
            key = reverselookup[thehash][0]
            torrentlist[key]["caption"] = after[after.rfind("\\") + 1 :]
            try:
                # prints a number to console to show progress. corresponds to the numbers in the file (every-two-lines).  (tip:) to show incremental numbers use (((i+1)/2)+1)
                # filenames printed to console, will be missing any unicode chars because the windows console is not unicode compatible!!!! (annoying)
                print i, before.encode("ascii", errors="ignore")
                print i + 1, after.encode("ascii", errors="ignore")
                os.rename(before, after)
            except Exception as e:
                traceback.print_exc()  # will output any errors to console but keep going
            torrentlist[key]["path"] = after
            if after.endswith(".mp3") or after.endswith(
                ".flac"
            ):  # .mp3 .flac = I personally didnt have any "Single file" .ogg, .aac, etc that needed special handling in this manner
                if torrentlist[key].has_key(
                    "targets"
                ):  # these lines are a quick fix, for an oversight in the uTorrent process. changing path is not enough
                    torrentlist[key]["targets"][0][1] = after[
                        after.rfind("\\") + 1 :
                    ]  # single-file-mode torrents have a "targets" list that controls the filename

    torrentlist["rec"] = rec  # add the thing we removed back in so we dont break anything (not sure what this is)
    # fileguard does not need to go back, in fact, purposefully needs to stay out.
    newfile.write(bencode.bencode(torrentlist))
    newfile.close()
    print "Finished writing: ", newfile.name