def generate_rels(filename):
    """Create relationships data for a graph.

    Prepare pairs of node ids to be written as graph links
    in a csv file.

    Parameters
    ----------
    filename: str
        The data to use for the relationships.
        The data file must contain a list of lists.

    Yields
    ------
    tuple
        A link between two nodes: ``(first_address, other_address)``.

    """
    all_links = set()

    for row in load_row_by_row(filename):
        if len(row) >= 2:
            # Deduplicate the row; set order is arbitrary, so the
            # "first" address is an arbitrary representative.
            ads = list(set(row))
            first = ads[0]
            # Link the representative to every other distinct address.
            # (The original indexed ads[ad + 1] past the end and relied
            # on a bare except to stop — slicing avoids that.)
            for other in ads[1:]:
                all_links.add((first, other))

    yield from all_links
def generate_nodes(filename):
    """Create nodes data for a graph.

    Prepare the data to be written in a csv file. The function
    generates the data for each column in the csv.

    Parameters
    ----------
    filename: str
        The data to use for the nodes.
        The data file must contain a list of lists.

    Yields
    ------
    tuple
        A pair ``(node_id, node_name)``; both entries are the same
        address value.

    """
    node_ids = set()

    # Collect every distinct address from rows with at least two items.
    for row in load_row_by_row(filename):
        if len(row) >= 2:
            node_ids.update(row)

    # The node id doubles as the node name.
    for node_id in node_ids:
        yield (node_id, node_id)
def append_rest_of_clusters(load_adds, load_adds_type, load_clusters_file,
                            save_file, save_in_format):
    """Add the rest of the addresses to the clusters.

    The dictionary of addresses can be used to add the rest
    of addresses to the clusters. Input and output addresses
    can also be used.

    Parameters
    ----------
    load_adds: str
        filename to load containing the list of addresses in
        input or output
    load_adds_type: str or type
        'list' (or the builtin ``list``) if the addresses to append
        are in a list; 'dict' (or the builtin ``dict``) if they are
        in a dictionary
    load_clusters_file: str
        The file containing the clusters in which to append
    save_file: str
        The filename in which to save the result
    save_in_format: str
        4 options: JSON, pickle, npy, hdf5

    Returns
    -------
    None
        Saves the result in a new file.

    Raises
    ------
    ValueError
        If ``load_adds_type`` is not one of the accepted values.

    """
    # Accept both the string names documented above and the builtin
    # types (the original only matched the types, so string callers
    # silently skipped both branches and crashed on `actor`).
    if load_adds_type in ("list", list):
        # Keep only single-address rows: these addresses never appear
        # together with another address, so they are unclustered.
        actor = set()
        for row in load_row_by_row(load_adds):
            if len(row) < 2:
                actor.add(row[0])
    elif load_adds_type in ("dict", dict):
        # The clusters reference addresses by their integer ids, so
        # only the id range of the dictionary is needed.
        actor = load_dumped_data(load_adds, "pickle")
        actor = list(range(len(actor)))
    else:
        raise ValueError(
            "load_adds_type must be 'list' or 'dict', got %r" % (load_adds_type,))

    clusters = load_dumped_data(load_clusters_file, "pickle")

    print("Starting point :", len(clusters), "clusters")

    # Inverted view: address -> clusters, used for fast membership tests.
    C = invert_dictionary(clusters, list)

    # Every address not yet clustered becomes its own singleton cluster.
    for a in actor:
        if a not in C:
            clusters.update({len(clusters): [a]})

    print("The Total number of the clusters is :", len(clusters))

    dump_variable(clusters, save_file, save_in_format)
def nodes_transactions(files, name_col=None, *functions):
    """Create the transactions nodes.

    Parameters
    ----------
    files: list
        List of the data files names (part-0001, ...)
    name_col: str, optional (default is None)
        Specifies the name of the node
    *functions : functions
        Functions used to create the node and its properties

    Yields
    ------
    list
        A list that will be written in a csv file; each element
        in the list will be written in a different column.

    """
    from itertools import chain

    processed = 0
    for data_file in files:
        for row in load_row_by_row(data_file):
            processed += 1
            if processed % 10000000 == 0:
                print(processed, "transactions processed")

            # Each function produces the values for one csv column.
            columns = [fn(row) for fn in functions]

            # Optionally append a generated node name, e.g. "tx42".
            if name_col:
                columns.append([name_col + str(processed)])

            # Flatten the per-column lists into a single row.
            yield list(chain.from_iterable(columns))
def addresses_to_dictionary(all_addresses, save_file, f_format):
    """Create the dictionary of addresses.

    Parameters
    ----------
    all_addresses: str
        filename containing the list of addresses
    save_file: str
        The name of the file where to save the dictionary
    f_format: str
        Serialization format forwarded to ``dump_variable``
        (e.g. 'pickle', 'JSON')

    Returns
    -------
    None
        Writes the dictionary on the hard drive.

    """
    unique_adds = set()
    for row in load_row_by_row(all_addresses):
        unique_adds.update(row)

    # NOTE(review): set iteration order is not stable across runs, so
    # the integer id assigned to each address may differ per execution.
    id_adds = dict(enumerate(unique_adds))
    dump_variable(id_adds, save_file, f_format)
def create_links_tx_addresses(files, dictionary):
    """Create the relationships between the transactions and the addresses.

    This function extracts the information of the properties.

    Parameters
    ----------
    files: list
        List of the data files names (part-0001, ...)
    dictionary: dict
        The dictionary of addresses, in which the addresses are keys.

    Yields
    ------
    list
        A list that will be written in a csv file, each element in
        the list will be written in a different column. Columns to
        obtain: txid, address in or out, value, index in or out,
        address in int.

    """
    from itertools import zip_longest, chain

    get = GetInTransaction()
    file_count = 0
    for file in files:
        file_count += 1
        print("Processing file", file_count)
        for transaction in load_row_by_row(file):
            ## Addresses
            # Select input/output addresses and convert the address
            # strings to integers in place via the dictionary.
            ads_in = get.input_addresses(transaction)
            rewrite(ads_in, dictionary)
            ads_out = get.output_addresses(transaction)
            rewrite(ads_out, dictionary)

            ## Link "in" or "out" and their corresponding indexes to
            ## each address.
            # zip_longest with a fillvalue tags every address with "in".
            ins = list(zip_longest(ads_in, ["in"], fillvalue="in"))

            # Pair each tagged address with its index; zip truncates to
            # the shorter of the two sequences.
            indexin = get.index_in(transaction)
            ins = list(zip(ins, indexin))

            # Some transactions carry no output indexes; keep a
            # best-effort placeholder instead of failing.
            try:
                indexout = get.index_out(transaction)
            except Exception:
                indexout = ["no_indexOut"]

            outs = list(zip_longest(ads_out, ["out"], fillvalue="out"))
            outs = list(zip(outs, indexout))

            ## Values
            values_out = get.output_values(transaction)
            # Coinbase-like transactions may have no input values.
            try:
                values_in = get.input_values(transaction)
            except Exception:
                values_in = ["no_value"]

            # link input/output addresses to their bitcoin values
            rows_in = list(zip(ins, values_in))
            rows_out = list(zip(outs, values_out))

            # Get the transaction id
            tx_id = get.txid(transaction)

            # One flat stream of ((address, tag), index), value) rows.
            data = list(chain(rows_in, rows_out))

            for g in data:
                # txid, tx_in_or_out, value, index_in_or_out, add_int
                yield [tx_id[0], g[0][0][1], g[1], g[0][1], g[0][0][0]]
def change_address(files, save_file, addresses_ids):
    """Get the change address in an output BTC transaction.

    The change address is selected according to the heuristic
    proposed by Athey et al. (2017). In a two-output transaction,
    if one of the outputs has 3 more decimal places than the other
    output value (which has 4 or fewer decimal places), we declare
    the output with the larger number of decimal digits to be the
    change address.

    Parameters
    ----------
    files: list
        List of the data files names
    save_file: str
        Filename in which to save the result
    addresses_ids: dict
        Dictionary of addresses in which the addresses are keys

    Returns
    -------
    None
        Writes rows to ``save_file``; a row is a list containing the
        change address and 1 input address.

    """
    import os

    def _decimal_places(satoshi):
        # Number of BTC decimal places of an amount expressed in
        # satoshi (1 BTC = 10**8 satoshi): the lowest power of ten
        # that does not divide the value determines the precision.
        for i in range(1, 9):
            if satoshi % 10 ** i != 0:
                return 9 - i
        return 0

    # Start from a clean output file.
    if os.path.isfile(save_file):
        os.remove(save_file)

    get = GetInTransaction()
    processed = 0
    with open(save_file, "w") as save:
        for file in files:
            for tx in load_row_by_row(file):
                processed += 1
                if processed % 1000000 == 0:
                    print(processed, "transactions processed...")

                # The heuristic only applies to two-output transactions.
                values = get.output_values(tx)
                if len(values) != 2:
                    continue

                output_addresses = get.output_addresses(tx)

                # (decimal places, output position) for each value.
                decimals = [(_decimal_places(v), pos)
                            for pos, v in enumerate(values)]

                # Conditions to respect for h2 clustering: one value has
                # at most 4 decimal places and the two precisions differ
                # by more than 3.
                if (decimals[0][0] <= 4 or decimals[1][0] <= 4) and \
                        abs(decimals[0][0] - decimals[1][0]) > 3:
                    # The change address is the output with the larger
                    # number of decimals (the > 3 gap guarantees the
                    # two precisions differ, so one branch always fires).
                    if decimals[0][0] < decimals[1][0]:
                        change_addr = output_addresses[decimals[1][1]]
                    else:
                        change_addr = output_addresses[decimals[0][1]]

                    # Same actor: first input address + change address.
                    same_actor = get.input_addresses(tx)[:1] + [change_addr]
                    same_actor = list(set(same_actor))

                    # convert addresses in strings to integers
                    rewrite(same_actor, addresses_ids)

                    # save the list
                    save.write("%s\n" % same_actor)