def _normalize_item(item):
    """Return ``item`` in the form that is sent to the database.

    Integers are returned unchanged.  Anything else is re-encoded from
    Latin-1 to UTF-8; encoded values of 4096 bytes or more are cut to at
    most 4050 bytes.  Items that cannot be re-encoded (no ``decode`` method,
    or an encoding error) are returned unchanged.
    """
    if isinstance(item, int):
        return item
    try:
        encoded = item.decode('latin1').encode('utf8')
    except (AttributeError, UnicodeError):
        # Best effort: leave values that are not byte strings as they are.
        return item
    if len(encoded) >= 4096:
        # A plain byte slice can cut a two-byte UTF-8 character in half;
        # decoding with 'ignore' drops such a dangling byte so the stored
        # value is still valid UTF-8.
        encoded = encoded[:4050].decode('utf8', 'ignore').encode('utf8')
    return encoded


def import_rows(rows):
    """Insert rows into the ``earmark_documents`` table.

    Every row must provide, in order, the values for the columns
    ``earmark_id``, ``document_id``, ``page_number`` and ``excerpt``.
    Each item is normalized with ``_normalize_item`` before insertion.

    Args:
        rows: iterable of row sequences (four items each, matching the
            placeholders of the INSERT statement).

    Returns:
        None.  The rows are committed in a single transaction.

    Raises:
        Whatever ``psycopg2`` raises on connection or insertion failure; the
        connection is closed before the exception propagates.
    """
    conn_string = configuration.get_connection_string()
    earmarks = [[_normalize_item(item) for item in row] for row in rows]

    cmd = "insert into earmark_documents (earmark_id, document_id, page_number, excerpt) values (%s, %s, %s, %s)"
    conn = psycopg2.connect(conn_string)
    try:
        cur = conn.cursor()
        cur.executemany(cmd, earmarks)
        conn.commit()
    finally:
        # Always release the connection, even when the insert fails.
        conn.close()
def import_rows(rows):
    """Insert rows into the ``earmark_documents`` table.

    Every row must provide, in order, the values for the columns
    ``earmark_id``, ``document_id``, ``page_number`` and ``excerpt``.

    Integer items are inserted unchanged.  Any other item is re-encoded from
    Latin-1 to UTF-8 when possible; encoded values of 4096 bytes or more are
    cut to at most 4050 bytes.  Items that cannot be re-encoded are inserted
    as they are.

    Args:
        rows: iterable of row sequences (four items each, matching the
            placeholders of the INSERT statement).

    Returns:
        None.  The rows are committed in a single transaction.

    Raises:
        Whatever ``psycopg2`` raises on connection or insertion failure; the
        connection is closed before the exception propagates.
    """
    CONN_STRING = configuration.get_connection_string()
    earmarks = []
    for row in rows:
        new_earmark = []
        for item in row:
            if not isinstance(item, int):
                try:
                    encoded = item.decode('latin1').encode('utf8')
                except (AttributeError, UnicodeError):
                    # Best effort: values that are not byte strings (or that
                    # fail to convert) are stored untouched.
                    pass
                else:
                    if len(encoded) >= 4096:
                        # Slicing bytes may split a two-byte UTF-8 character;
                        # 'ignore' discards the dangling byte so the result
                        # remains valid UTF-8.
                        encoded = encoded[:4050].decode('utf8', 'ignore').encode('utf8')
                    item = encoded
            new_earmark.append(item)
        earmarks.append(new_earmark)

    cmd = "insert into earmark_documents (earmark_id, document_id, page_number, excerpt) values (%s, %s, %s, %s)"
    conn = psycopg2.connect(CONN_STRING)
    try:
        cur = conn.cursor()
        cur.executemany(cmd, earmarks)
        conn.commit()
    finally:
        # Close the connection whether or not the insert succeeded.
        conn.close()
 def __init__(self, **kwargs):
     """Set the generator's fixed identifiers and its keyword options.

     Recognized keyword arguments and their defaults:
         depth: 3
         distinguish_levels: True
         force: True

     The database connection string is read from the project
     configuration and stored on the instance as ``CONN_STRING``.
     """
     # Option name -> value used when the caller does not supply it.
     option_defaults = (
         ("depth", 3),
         ("distinguish_levels", True),
         ("force", True),
     )

     self.name = "wikipedia_categories_feature_generator"
     for option, fallback in option_defaults:
         setattr(self, option, kwargs.get(option, fallback))
     self.feature_prefix = "WIKI_CATEGORY_"
     self.NO_WIKI_PAGE_FEATURE = "NO_WIKIPEDIA_PAGE_WAS_FOUND"
     self.CONN_STRING = configuration.get_connection_string()
import os, sys, inspect
# Put the parent directory of this file at the front of sys.path; this must
# happen before the project import below (presumably that is where the
# ``util`` package lives - confirm against the repository layout).
sys.path.insert(0, os.path.realpath(os.path.abspath(os.path.join(os.path.split(inspect.getfile( inspect.currentframe() ))[0],".."))))
from util import configuration
import psycopg2
import csv, pandas as pd
import codecs
from pprint import pprint

# Usage string built from the script name in sys.argv[0].
USAGE = "python %s <input-csv-file>" %(sys.argv[0])

# Connection string read once, at import time, from the project configuration.
CONN_STRING = configuration.get_connection_string()

def import_csv_file(path):
    """Read a CSV file, clean its rows and pretty-print them.

    The file is decoded as UTF-8 and parsed with pandas.  For every row, if
    either the value at index 2 or the value at index 3 is not a string,
    both are replaced by empty strings.

    NOTE(review): the original code prepared an INSERT into
    ``earmark_documents (earmark_id, document_id, page_number, excerpt)`` but
    returned right after printing the rows, so nothing was ever written to
    the database.  That print-and-return behaviour is kept here; confirm
    whether the insert should be re-enabled.

    Args:
        path: path of the CSV file to read.

    Returns:
        None.
    """
    # pandas parses the whole file eagerly, so the handle can be closed as
    # soon as read_csv returns.
    with codecs.open(path, 'r', 'utf-8') as handle:
        stuff = pd.read_csv(handle)

    rows = []
    for _, record in stuff.iterrows():
        v = list(record)
        if not isinstance(v[3], basestring) or not isinstance(v[2], basestring):
            v[3] = ''
            v[2] = ''
        rows.append(v)

    # The connection is still opened (as before, an unreachable database is
    # reported here) but it is now always closed instead of being leaked.
    conn = psycopg2.connect(CONN_STRING)
    try:
        pprint(rows)
    finally:
        conn.close()
#r.execute ("delete from earmark_documents")
# Example #5
# 0
# Put the parent directory of this file at the front of sys.path; this must
# happen before the project import below (presumably that is where the
# ``util`` package lives - confirm against the repository layout).
sys.path.insert(
    0,
    os.path.realpath(
        os.path.abspath(
            os.path.join(
                os.path.split(inspect.getfile(inspect.currentframe()))[0],
                ".."))))
from util import configuration
import psycopg2
import csv, pandas as pd
import codecs
from pprint import pprint

# Usage string built from the script name in sys.argv[0].
USAGE = "python %s <input-csv-file>" % (sys.argv[0])

# Connection string read once, at import time, from the project configuration.
CONN_STRING = configuration.get_connection_string()


def import_csv_file(path):
    """Read a CSV file and build a cleaned list of its rows.

    The file is decoded as UTF-8 and parsed with pandas.  For every row, if
    either the value at index 2 or the value at index 3 is not a string,
    both are replaced by empty strings.

    NOTE(review): as written, the function ends after the list is built and
    returns None, so the collected rows are never used.  The body looks
    truncated - confirm what should happen with ``rows``.

    Args:
        path: path of the CSV file to read.

    Returns:
        None.
    """
    # read_csv consumes the file completely, so close the handle right away
    # rather than leaving it to the garbage collector.
    with codecs.open(path, 'r', 'utf-8') as source:
        stuff = pd.read_csv(source)

    rows = []
    for _, series in stuff.iterrows():
        v = list(series)
        both_strings = isinstance(v[2], basestring) and isinstance(v[3], basestring)
        if not both_strings:
            v[3] = ''
            v[2] = ''
        rows.append(v)
# Example #6
# 0
import os, sys, inspect
# Put the parent directory of this file at the front of sys.path; this must
# happen before the project-level imports below.
sys.path.insert(0, os.path.realpath(os.path.abspath(os.path.join(os.path.split(inspect.getfile( inspect.currentframe() ))[0],".."))))
import argparse

import os
import psycopg2
import psycopg2.extras
from  util import configuration
# Connection string read once, at import time, from the project configuration.
# (This assignment and the following import were fused on a single line,
# which is a syntax error; they are separate statements.)
CONN_STRING =  configuration.get_connection_string()
import multiprocessing as mp
import string
import re

import logging
from classification.pipe import Pipe
from classification.blocks_pipe import BlocksPipe
from classification.instances_grouper import InstancesGrouper

from classification.prepare_earmark_data import  serialize_instances, load_instances

from matching.feature_generators.jaccard_feature_generator import JaccardFeatureGenerator
from matching.feature_generators.ranking_feature_generator import RankingFeatureGenerator
from matching.feature_generators.difference_feature_generator import DifferenceFeatureGenerator
from matching.feature_generators.infix_feature_generator import InfixFeatureGenerator
from matching.feature_generators.table_feature_generator import TableFeatureGenerator

from matching.matching_util import *


# Log at INFO level with a timestamp and level prefix on every message.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
MIN = 0.1