示例#1
0
def demo(input=""):
    """Run the full InCredible pipeline on one document clique.

    Args:
        input: String of the form "Data/jsonFile#jsonIndex" — the dataset
            path and the clique key, joined by '#'. (NOTE(review): the
            parameter shadows the builtin ``input``; kept for
            backward-compatibility with keyword callers.)

    Raises:
        IndexError: if *input* contains no '#' separator.
    """
    print("Reading the documents (from json dataset)... input should be 'Data/jsonFile#jsonIndex")
    # Split once instead of twice; indexing keeps the original IndexError
    # behavior when the '#' separator is missing.
    parts = input.split("#")
    jsonFile = parts[0]
    jsonIndex = parts[1]
    cliqueOfArticles = functions.readJsonFile(jsonFile)
    contents = cliqueOfArticles[jsonIndex]["contents"]
    publications = cliqueOfArticles[jsonIndex]["publications"]
    titles = cliqueOfArticles[jsonIndex]["sentences"]

    print("Initialize object...")
    # Documents plus their publication classes (e.g. cnn, fox) and titles.
    gDoc = graphDoc.graphDocuments(contents, publications, titles)

    print("Extracting sentence structure...")
    gDoc.sentenceProcess(withGA=True, output="temp/sentences.pkl")

    print("Computing sentence similarities...")
    gDoc.computeSentenceDistances(similarityFunction="cosine")

    print("Keeping only the most important sentence-to-sentence similarities (thresholding)...")
    gDoc.reduceSentenceSimilarityFrame(pA=85, pB=93)

    print("Create graph... (no plotting)")
    gDoc.computeNetwork(plot=False, cliqueEdges=[])

    print("Clique finder in the graph...")
    gDoc.cliqueFinder(output="temp/cliquesFinal.json", orderby="median tf-idf score")
示例#2
0
def load_clique_results(glob_expression):
    """Load every JSON file matching *glob_expression*.

    Returns a dict mapping each matched file name to its parsed content.
    """
    return {
        match: functions.readJsonFile(match)
        for match in glob.glob(glob_expression)
    }
示例#3
0
def all_clique_processing(jsonFile):
    """Process every document clique in *jsonFile* with a worker pool.

    Args:
        jsonFile: Path of the JSON dataset of document cliques.
    """
    # Find cross-referenced pieces of information
    print("Reading the documents")
    cliqueOfArticles = functions.readJsonFile(jsonFile)
    items = cliqueOfArticles.items()

    # Context manager guarantees the worker processes are reaped even if
    # an item raises inside the loop (the original never closed the pool).
    with Pool(processes=8) as pool:
        for clique_id in tqdm(pool.imap_unordered(fn_wrap, items),
                              desc='outer loop'):
            print(clique_id, 'done')
示例#4
0
def stats(jsonFile):
    """Print and plot summary statistics about the initial cliques.

    Args:
        jsonFile: Path of the JSON dataset of document cliques.
    """
    # Computes some stats about initial cliques
    print('computing stats about input')
    cliqueOfArticles = functions.readJsonFile(jsonFile)
    size_acc = 0
    size_distrib = []           # clique sizes (number of publications)
    clique_sim_distrib = []     # per-clique similarity scores
    per_outlet = defaultdict(int)  # article count per publication outlet
    # Keys are unused, so iterate the values only.
    for v in cliqueOfArticles.values():
        publications = v['publications']
        size_acc += len(publications)
        size_distrib.append(float(len(publications)))
        clique_sim_distrib.append(v['score'])
        for o in publications:
            per_outlet[o] += 1

    print('total articles', size_acc)
    print('#cliques', len(cliqueOfArticles))
    # Guard against ZeroDivisionError on an empty dataset.
    if cliqueOfArticles:
        print('average clique len', size_acc / len(cliqueOfArticles))
    print(size_distrib)
    plot.plot_distribution(size_distrib, 'temp/fig_clique_size_distrib.png')
    plot.plot_distribution(clique_sim_distrib,
                           'temp/fig_clique_sim_distrib.png')
    print(per_outlet)
示例#5
0
    'based on the authors\' [demo](http://fairnews.ewi.tudelft.nl/InCredible/) '
    'and [source code](https://github.com/dbountouridis/InCredible). ')
st.sidebar.info(
    'The code for this demo is [here](https://github.com/MartinoMensio/InCredible)'
)
st.sidebar.title('Instructions')
st.sidebar.info(
    'Select a document clique to load: each one represents a different story')
st.sidebar.info('Select a main source to see the corresponding article')

doc_cliques_input_file = st.text_input('Document cliques input file:',
                                       'Data/dataset.json')
output_path = st.text_input('Path where the computation results are:',
                            'temp/cliques_GA/')

document_cliques = functions.readJsonFile(doc_cliques_input_file)
document_cliques_ids = list(document_cliques.keys())

# Fall back to the first clique when the hard-coded default id is missing
# from the loaded dataset (the original crashed with ValueError in .index()
# whenever a different input file was chosen).
default = '0.7996630192184154-7-6180'
default_index = (document_cliques_ids.index(default)
                 if default in document_cliques_ids else 0)
chosen_doc_clique_id = st.selectbox('Document clique to load:',
                                    document_cliques_ids,
                                    default_index)
chosen_doc_clique = document_cliques[chosen_doc_clique_id]

# Per-clique computation results live next to output_path as
# cliques_<id>.json — presumably written by the processing step; verify.
clique_outputs = functions.readJsonFile(
    Path(output_path) / f'cliques_{chosen_doc_clique_id}.json')
st.text(f'Clique {chosen_doc_clique_id} selected')
st.text('Titles:\n\n' + '\n'.join([
    f'{p}--> {t}' for p, t in zip(chosen_doc_clique['publications'],
                                  chosen_doc_clique['sentences'])