def compute_term_features(doc_list, global_term_dict, df_raw, store_filename):
    import math
    import pandas as pd
    import ioData
    import utils
    import tabulate  # only needed if the commented-out debug prints are re-enabled

    # index df_raw by its 'link' column so document metadata can be looked up by URL
    df = df_raw.set_index('link')
    feature_extractor = utils.TextualFeatureExtractor(df)

    nr_docs = len(doc_list)
    nr_terms = sum(len(doc.relevant_terms) for doc in doc_list)

    term_dataset = []

    for doc in doc_list:
        print "======== " + doc.url + " ========"
        meta_keywords = df.loc[doc.url]['keywords']

        # 1) compute TF
        # print "Computing TF ... "
        doc.compute_tf()

        # print "Document parsed: "
        # print doc.transformed

        # print "Computing DF, TFIDF and Textual Features ... "
        for term in doc.relevant_terms:
            # 2) compute DF and TFIDF - use global_term_dict
            term.df = global_term_dict[term]
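            # the IDF factor below is the smoothed variant log2(1 + N/df):
            # e.g. nr_docs = 100 and term.df = 10 give log2(11), roughly 3.46,
            # and a term occurring in every document still scores log2(2) = 1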
            term.tfidf = term.tf * math.log(1 + float(nr_docs) / float(term.df), 2)
            # print tabulate.tabulate([[term, term.cvalue, term.tf, term.df, term.tfidf]], headers=('term', 'cval', 'tf', 'df', 'tfidf'))
            # print ""

            # 3) compute linguistic features
            term.set_textual_feature_extractor(feature_extractor)
            term.extract_textual_features()

            # print tabulate.tabulate([[term, term.is_title, term.is_url, term.is_first_par, term.is_last_par, term.is_description,
            #                          term.is_img_caption, term.is_anchor, term.doc_position]],
            #                         headers=('term', 'is_title', 'is_url', 'is_first_par', 'is_last_par', 'is_description', 'is_img_desc', 'is_anchor', 'doc_pos'))

            # 4) check whether the term appears in the meta keywords (i.e. is relevant)
            term.is_keyword = is_meta_keyword(term, meta_keywords)
            # print ":: IS RELEVANT = " + str(term.is_keyword)
            # print ("\n\n")

            term_dataset.append([term.original, doc.url, term.cvalue, term.tf, term.df, term.tfidf,
                                 term.is_title, term.is_url, term.is_first_par, term.is_last_par, term.is_description,
                                 term.is_img_caption, term.is_anchor, term.doc_position, term.is_keyword])

    term_df_headers = ['term', 'doc_url', 'cvalue', 'tf', 'df', 'tfidf', 'is_title', 'is_url',
                       'is_first_par', 'is_last_par', 'is_description',
                       'is_img_desc', 'is_anchor', 'doc_pos', 'relevant']

    term_df = pd.DataFrame(term_dataset, columns=term_df_headers)
    ioData.writeData(term_df, store_filename)
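

# Example usage of compute_term_features (a minimal sketch; parse_documents,
# build_global_term_dict and the file names are hypothetical stand-ins, not
# functions defined in this module):
#
#   docs = parse_documents("dataset/articles.json")          # list of parsed document objects
#   term_dict = build_global_term_dict(docs)                 # maps each term to its document frequency
#   raw_df = ioData.readData("dataset/articles_meta.json")   # needs 'link' and 'keywords' columns
#   compute_term_features(docs, term_dict, raw_df, "dataset/term-features.json")

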
def create_term_train_test_dataset(global_term_feature_file, extracted_test_terms_file, train_feature_file, test_feature_file):
    import json
    import ioData as io
    df = io.readData(global_term_feature_file)

    with open(extracted_test_terms_file) as fp:
        cvalRes = json.load(fp, encoding="utf-8")

    test_urls = cvalRes.keys()
    test_df = df.loc[df['doc_url'].isin(test_urls)]
    io.writeData(test_df, test_feature_file)

    train_df = df.loc[~df.index.isin(test_df.index)]
    io.writeData(train_df, train_feature_file)
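

# The extracted-terms file is assumed to be a JSON object keyed by document URL;
# only the keys are used here, to select which rows go into the test split, e.g.:
#
#   { "http://example.com/article-1": [...], "http://example.com/article-2": [...] }

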
def has_jwplayer(paragraphs):
    # flag documents that embed a JW Player call; each paragraph is expected
    # to be an iterable of text fragments
    for p in paragraphs:
        if any("jwplayer(" in fragment for fragment in p):
            return True

    return False
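
# Example (a sketch; assumes each paragraph is a list of text fragments rather
# than a single flat string):
#
#   has_jwplayer([["intro text"], ["jwplayer('player-1').setup();"]])   # -> True
#   has_jwplayer([["plain article text"]])                              # -> False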

if __name__ == "__main__":
    import pandas as pd
    import ioData

    TRAIN_DATASET_FILE = "dataset/term-feature-train-dataset-v3.json"
    TEST_DATASET_FILE = "dataset/term-feature-test-dataset-v3.json"

    grapeshot_df = pd.read_excel("dataset/meta_keywords_overlaps.xlsx", "Overlaps")
    grapeshot_df = grapeshot_df[["URL", "Keywords"]]
    grapeshot_df.columns = ['url', 'keywords']
    grapeshot_df['keywords'] = grapeshot_df['keywords'].str.lower()

    extracted_df = extract_test_keywords(TRAIN_DATASET_FILE, TEST_DATASET_FILE, retrain=True)

    # merge dataframes
    comparison_df = pd.merge(extracted_df, grapeshot_df, on='url')
    comparison_df['overlap'] = comparison_df.apply(extracted_keyterms_overlap, axis=1)

    ioData.writeData(comparison_df, "dataset/comparison_df_v3.json")
    comparison_df.to_excel("dataset/comparison_df_v3.xlsx", "Overlap")

    print comparison_df.describe()